@fugood/llama.node 0.3.3 → 0.3.4

This diff shows the changes between publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +1 -1
  21. package/src/LlamaContext.cpp +81 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp (entry 64 above, +405 -316):

@@ -6,28 +6,28 @@
  #include <clocale>
  #include <cmath>
  #include <cstdio>
+ #include <cstdlib>
  #include <cstring>
  #include <ctime>
- #include <cstdlib>
  #include <iterator>
  #include <map>
  #include <numeric>
  #include <regex>
  #include <sstream>
  #include <string>
- #include <vector>
  #include <thread>
+ #include <vector>

+ #include "common.h"
  #include "ggml.h"
  #include "llama.h"
- #include "common.h"

  #ifdef _WIN32
- #define WIN32_LEAN_AND_MEAN
- #ifndef NOMINMAX
- # define NOMINMAX
- #endif
- #include <windows.h>
+ # define WIN32_LEAN_AND_MEAN
+ # ifndef NOMINMAX
+ # define NOMINMAX
+ # endif
+ # include <windows.h>
  #endif

  // utils
@@ -36,8 +36,7 @@ static uint64_t get_time_ns() {
  return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
  }

- template<class T>
- static std::string join(const std::vector<T> & values, const std::string & delim) {
+ template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
  std::ostringstream str;
  for (size_t i = 0; i < values.size(); i++) {
  str << values[i];
@@ -48,38 +47,35 @@ static std::string join(const std::vector<T> & values, const std::string & delim
  return str.str();
  }

- template<typename T, typename F>
- static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
+ template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
  std::vector<std::string> str_values;
  std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
  return str_values;
  }

- template<typename T>
- static T avg(const std::vector<T> & v) {
+ template <typename T> static T avg(const std::vector<T> & v) {
  if (v.empty()) {
  return 0;
  }
  T sum = std::accumulate(v.begin(), v.end(), T(0));
- return sum / (T)v.size();
+ return sum / (T) v.size();
  }

- template<typename T>
- static T stdev(const std::vector<T> & v) {
+ template <typename T> static T stdev(const std::vector<T> & v) {
  if (v.size() <= 1) {
  return 0;
  }
- T mean = avg(v);
+ T mean = avg(v);
  T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
- T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
+ T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1));
  return stdev;
  }

  static std::string get_cpu_info() {
  std::vector<std::string> cpu_list;
  for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
- auto * dev = ggml_backend_dev_get(i);
- auto dev_type = ggml_backend_dev_type(dev);
+ auto * dev = ggml_backend_dev_get(i);
+ auto dev_type = ggml_backend_dev_type(dev);
  if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
  cpu_list.push_back(ggml_backend_dev_description(dev));
  }
@@ -90,8 +86,8 @@ static std::string get_cpu_info() {
  static std::string get_gpu_info() {
  std::vector<std::string> gpu_list;
  for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
- auto * dev = ggml_backend_dev_get(i);
- auto dev_type = ggml_backend_dev_type(dev);
+ auto * dev = ggml_backend_dev_get(i);
+ auto dev_type = ggml_backend_dev_type(dev);
  if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
  gpu_list.push_back(ggml_backend_dev_description(dev));
  }
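A side note on the avg/stdev helpers reformatted in the hunk above: stdev computes the sample standard deviation of the timing samples in a single pass, using the standard computational identity (a property of the formula itself, not something this diff changes):

    s = \sqrt{\frac{\sum_i x_i^2 - n\,\bar{x}^2}{n-1}} = \sqrt{\frac{\sum_i x_i^2}{n-1} - \bar{x}^2 \cdot \frac{n}{n-1}}

With sq_sum = Σᵢ xᵢ² from std::inner_product and mean from avg, the second form is exactly the expression under the square root in the code: sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1).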
@@ -100,17 +96,24 @@ static std::string get_gpu_info() {
  }

  // command line params
- enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
+ enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };

  static const char * output_format_str(output_formats format) {
  switch (format) {
- case NONE: return "none";
- case CSV: return "csv";
- case JSON: return "json";
- case JSONL: return "jsonl";
- case MARKDOWN: return "md";
- case SQL: return "sql";
- default: GGML_ABORT("invalid output format");
+ case NONE:
+ return "none";
+ case CSV:
+ return "csv";
+ case JSON:
+ return "json";
+ case JSONL:
+ return "jsonl";
+ case MARKDOWN:
+ return "md";
+ case SQL:
+ return "sql";
+ default:
+ GGML_ABORT("invalid output format");
  }
  }

@@ -135,10 +138,14 @@ static bool output_format_from_str(const std::string & s, output_formats & forma

  static const char * split_mode_str(llama_split_mode mode) {
  switch (mode) {
- case LLAMA_SPLIT_MODE_NONE: return "none";
- case LLAMA_SPLIT_MODE_LAYER: return "layer";
- case LLAMA_SPLIT_MODE_ROW: return "row";
- default: GGML_ABORT("invalid split mode");
+ case LLAMA_SPLIT_MODE_NONE:
+ return "none";
+ case LLAMA_SPLIT_MODE_LAYER:
+ return "layer";
+ case LLAMA_SPLIT_MODE_ROW:
+ return "row";
+ default:
+ GGML_ABORT("invalid split mode");
  }
  }

@@ -149,59 +156,59 @@ static std::string pair_str(const std::pair<int, int> & p) {
  }

  struct cmd_params {
- std::vector<std::string> model;
- std::vector<int> n_prompt;
- std::vector<int> n_gen;
+ std::vector<std::string> model;
+ std::vector<int> n_prompt;
+ std::vector<int> n_gen;
  std::vector<std::pair<int, int>> n_pg;
- std::vector<int> n_batch;
- std::vector<int> n_ubatch;
- std::vector<ggml_type> type_k;
- std::vector<ggml_type> type_v;
- std::vector<int> n_threads;
- std::vector<std::string> cpu_mask;
- std::vector<bool> cpu_strict;
- std::vector<int> poll;
- std::vector<int> n_gpu_layers;
- std::vector<std::string> rpc_servers;
- std::vector<llama_split_mode> split_mode;
- std::vector<int> main_gpu;
- std::vector<bool> no_kv_offload;
- std::vector<bool> flash_attn;
- std::vector<std::vector<float>> tensor_split;
- std::vector<bool> use_mmap;
- std::vector<bool> embeddings;
- ggml_numa_strategy numa;
- int reps;
- ggml_sched_priority prio;
- int delay;
- bool verbose;
- bool progress;
- output_formats output_format;
- output_formats output_format_stderr;
+ std::vector<int> n_batch;
+ std::vector<int> n_ubatch;
+ std::vector<ggml_type> type_k;
+ std::vector<ggml_type> type_v;
+ std::vector<int> n_threads;
+ std::vector<std::string> cpu_mask;
+ std::vector<bool> cpu_strict;
+ std::vector<int> poll;
+ std::vector<int> n_gpu_layers;
+ std::vector<std::string> rpc_servers;
+ std::vector<llama_split_mode> split_mode;
+ std::vector<int> main_gpu;
+ std::vector<bool> no_kv_offload;
+ std::vector<bool> flash_attn;
+ std::vector<std::vector<float>> tensor_split;
+ std::vector<bool> use_mmap;
+ std::vector<bool> embeddings;
+ ggml_numa_strategy numa;
+ int reps;
+ ggml_sched_priority prio;
+ int delay;
+ bool verbose;
+ bool progress;
+ output_formats output_format;
+ output_formats output_format_stderr;
  };

  static const cmd_params cmd_params_defaults = {
- /* model */ {"models/7B/ggml-model-q4_0.gguf"},
- /* n_prompt */ {512},
- /* n_gen */ {128},
+ /* model */ { "models/7B/ggml-model-q4_0.gguf" },
+ /* n_prompt */ { 512 },
+ /* n_gen */ { 128 },
  /* n_pg */ {},
- /* n_batch */ {2048},
- /* n_ubatch */ {512},
- /* type_k */ {GGML_TYPE_F16},
- /* type_v */ {GGML_TYPE_F16},
- /* n_threads */ {cpu_get_num_math()},
- /* cpu_mask */ {"0x0"},
- /* cpu_strict */ {false},
- /* poll */ {50},
- /* n_gpu_layers */ {99},
- /* rpc_servers */ {""},
- /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
- /* main_gpu */ {0},
- /* no_kv_offload */ {false},
- /* flash_attn */ {false},
- /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
- /* use_mmap */ {true},
- /* embeddings */ {false},
+ /* n_batch */ { 2048 },
+ /* n_ubatch */ { 512 },
+ /* type_k */ { GGML_TYPE_F16 },
+ /* type_v */ { GGML_TYPE_F16 },
+ /* n_threads */ { cpu_get_num_math() },
+ /* cpu_mask */ { "0x0" },
+ /* cpu_strict */ { false },
+ /* poll */ { 50 },
+ /* n_gpu_layers */ { 99 },
+ /* rpc_servers */ { "" },
+ /* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
+ /* main_gpu */ { 0 },
+ /* no_kv_offload */ { false },
+ /* flash_attn */ { false },
+ /* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
+ /* use_mmap */ { true },
+ /* embeddings */ { false },
  /* numa */ GGML_NUMA_STRATEGY_DISABLED,
  /* reps */ 5,
  /* prio */ GGML_SCHED_PRIO_NORMAL,
@@ -218,38 +225,59 @@ static void print_usage(int /* argc */, char ** argv) {
  printf("options:\n");
  printf(" -h, --help\n");
  printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
- printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+ printf(" -p, --n-prompt <n> (default: %s)\n",
+ join(cmd_params_defaults.n_prompt, ",").c_str());
  printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
- printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
- printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
- printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
- printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
- printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
- printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
- printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
- printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+ printf(" -pg <pp,tg> (default: %s)\n",
+ join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
+ printf(" -b, --batch-size <n> (default: %s)\n",
+ join(cmd_params_defaults.n_batch, ",").c_str());
+ printf(" -ub, --ubatch-size <n> (default: %s)\n",
+ join(cmd_params_defaults.n_ubatch, ",").c_str());
+ printf(" -ctk, --cache-type-k <t> (default: %s)\n",
+ join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+ printf(" -ctv, --cache-type-v <t> (default: %s)\n",
+ join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+ printf(" -t, --threads <n> (default: %s)\n",
+ join(cmd_params_defaults.n_threads, ",").c_str());
+ printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
+ join(cmd_params_defaults.cpu_mask, ",").c_str());
+ printf(" --cpu-strict <0|1> (default: %s)\n",
+ join(cmd_params_defaults.cpu_strict, ",").c_str());
  printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
- printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+ printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
+ join(cmd_params_defaults.n_gpu_layers, ",").c_str());
  if (llama_supports_rpc()) {
- printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+ printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
+ join(cmd_params_defaults.rpc_servers, ",").c_str());
  }
- printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
- printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
- printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
- printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
- printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+ printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
+ join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
+ printf(" -mg, --main-gpu <i> (default: %s)\n",
+ join(cmd_params_defaults.main_gpu, ",").c_str());
+ printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
+ join(cmd_params_defaults.no_kv_offload, ",").c_str());
+ printf(" -fa, --flash-attn <0|1> (default: %s)\n",
+ join(cmd_params_defaults.flash_attn, ",").c_str());
+ printf(" -mmp, --mmap <0|1> (default: %s)\n",
+ join(cmd_params_defaults.use_mmap, ",").c_str());
  printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
- printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
+ printf(" -embd, --embeddings <0|1> (default: %s)\n",
+ join(cmd_params_defaults.embeddings, ",").c_str());
  printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
  printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
  printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
  printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
- printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
- printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
+ printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
+ output_format_str(cmd_params_defaults.output_format));
+ printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
+ output_format_str(cmd_params_defaults.output_format_stderr));
  printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
  printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
  printf("\n");
- printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+ printf(
+ "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
+ "multiple times.\n");
  }

  static ggml_type ggml_type_from_name(const std::string & s) {
@@ -281,22 +309,21 @@ static ggml_type ggml_type_from_name(const std::string & s) {
  return GGML_TYPE_COUNT;
  }

-
  static cmd_params parse_cmd_params(int argc, char ** argv) {
- cmd_params params;
- std::string arg;
- bool invalid_param = false;
- const std::string arg_prefix = "--";
- const char split_delim = ',';
-
- params.verbose = cmd_params_defaults.verbose;
- params.output_format = cmd_params_defaults.output_format;
+ cmd_params params;
+ std::string arg;
+ bool invalid_param = false;
+ const std::string arg_prefix = "--";
+ const char split_delim = ',';
+
+ params.verbose = cmd_params_defaults.verbose;
+ params.output_format = cmd_params_defaults.output_format;
  params.output_format_stderr = cmd_params_defaults.output_format_stderr;
- params.reps = cmd_params_defaults.reps;
- params.numa = cmd_params_defaults.numa;
- params.prio = cmd_params_defaults.prio;
- params.delay = cmd_params_defaults.delay;
- params.progress = cmd_params_defaults.progress;
+ params.reps = cmd_params_defaults.reps;
+ params.numa = cmd_params_defaults.numa;
+ params.prio = cmd_params_defaults.prio;
+ params.delay = cmd_params_defaults.delay;
+ params.progress = cmd_params_defaults.progress;

  for (int i = 1; i < argc; i++) {
  arg = argv[i];
@@ -338,7 +365,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  invalid_param = true;
  break;
  }
- params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
+ params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
  } else if (arg == "-b" || arg == "--batch-size") {
  if (++i >= argc) {
  invalid_param = true;
@@ -358,7 +385,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  invalid_param = true;
  break;
  }
- auto p = string_split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  std::vector<ggml_type> types;
  for (const auto & t : p) {
  ggml_type gt = ggml_type_from_name(t);
@@ -377,7 +404,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  invalid_param = true;
  break;
  }
- auto p = string_split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  std::vector<ggml_type> types;
  for (const auto & t : p) {
  ggml_type gt = ggml_type_from_name(t);
@@ -437,7 +464,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  invalid_param = true;
  break;
  }
- auto p = string_split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  std::vector<llama_split_mode> modes;
  for (const auto & m : p) {
  llama_split_mode mode;
@@ -476,10 +503,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  break;
  } else {
  std::string value(argv[i]);
- /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
- else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
- else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
- else { invalid_param = true; break; }
+ /**/ if (value == "distribute" || value == "") {
+ params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
+ } else if (value == "isolate") {
+ params.numa = GGML_NUMA_STRATEGY_ISOLATE;
+ } else if (value == "numactl") {
+ params.numa = GGML_NUMA_STRATEGY_NUMACTL;
+ } else {
+ invalid_param = true;
+ break;
+ }
  }
  } else if (arg == "-fa" || arg == "--flash-attn") {
  if (++i >= argc) {
@@ -509,9 +542,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  }
  for (auto ts : string_split<std::string>(argv[i], split_delim)) {
  // split string by ; and /
- const std::regex regex{R"([;/]+)"};
- std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
- std::vector<std::string> split_arg{it, {}};
+ const std::regex regex{ R"([;/]+)" };
+ std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
+ std::vector<std::string> split_arg{ it, {} };
  GGML_ASSERT(split_arg.size() <= llama_max_devices());

  std::vector<float> tensor_split(llama_max_devices());
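For readers tracing the --tensor-split parsing above: passing -1 as the submatch index to std::sregex_token_iterator selects the text between matches of [;/]+, i.e. it splits the argument on ';' and '/'. A minimal standalone sketch of the same standard-library pattern (illustrative only, not code from the package):

    #include <cstdio>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        const std::string ts = "3/1;0.5";
        // submatch index -1 yields the substrings *between* matches,
        // i.e. a split on ';' and '/'
        const std::regex delim{ R"([;/]+)" };
        std::sregex_token_iterator it{ ts.begin(), ts.end(), delim, -1 };
        std::vector<std::string> parts{ it, {} };
        for (const auto & p : parts) {
            printf("%s\n", p.c_str()); // prints 3, 1, 0.5 on separate lines
        }
        return 0;
    }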
@@ -570,52 +603,94 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  }

  // set defaults
- if (params.model.empty()) { params.model = cmd_params_defaults.model; }
- if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
- if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
- if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
- if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
- if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
- if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
- if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
- if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
- if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
- if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
- if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
- if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
- if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
- if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
- if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
- if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
- if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
- if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
- if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
- if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
+ if (params.model.empty()) {
+ params.model = cmd_params_defaults.model;
+ }
+ if (params.n_prompt.empty()) {
+ params.n_prompt = cmd_params_defaults.n_prompt;
+ }
+ if (params.n_gen.empty()) {
+ params.n_gen = cmd_params_defaults.n_gen;
+ }
+ if (params.n_pg.empty()) {
+ params.n_pg = cmd_params_defaults.n_pg;
+ }
+ if (params.n_batch.empty()) {
+ params.n_batch = cmd_params_defaults.n_batch;
+ }
+ if (params.n_ubatch.empty()) {
+ params.n_ubatch = cmd_params_defaults.n_ubatch;
+ }
+ if (params.type_k.empty()) {
+ params.type_k = cmd_params_defaults.type_k;
+ }
+ if (params.type_v.empty()) {
+ params.type_v = cmd_params_defaults.type_v;
+ }
+ if (params.n_gpu_layers.empty()) {
+ params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
+ }
+ if (params.rpc_servers.empty()) {
+ params.rpc_servers = cmd_params_defaults.rpc_servers;
+ }
+ if (params.split_mode.empty()) {
+ params.split_mode = cmd_params_defaults.split_mode;
+ }
+ if (params.main_gpu.empty()) {
+ params.main_gpu = cmd_params_defaults.main_gpu;
+ }
+ if (params.no_kv_offload.empty()) {
+ params.no_kv_offload = cmd_params_defaults.no_kv_offload;
+ }
+ if (params.flash_attn.empty()) {
+ params.flash_attn = cmd_params_defaults.flash_attn;
+ }
+ if (params.tensor_split.empty()) {
+ params.tensor_split = cmd_params_defaults.tensor_split;
+ }
+ if (params.use_mmap.empty()) {
+ params.use_mmap = cmd_params_defaults.use_mmap;
+ }
+ if (params.embeddings.empty()) {
+ params.embeddings = cmd_params_defaults.embeddings;
+ }
+ if (params.n_threads.empty()) {
+ params.n_threads = cmd_params_defaults.n_threads;
+ }
+ if (params.cpu_mask.empty()) {
+ params.cpu_mask = cmd_params_defaults.cpu_mask;
+ }
+ if (params.cpu_strict.empty()) {
+ params.cpu_strict = cmd_params_defaults.cpu_strict;
+ }
+ if (params.poll.empty()) {
+ params.poll = cmd_params_defaults.poll;
+ }

  return params;
  }

  struct cmd_params_instance {
- std::string model;
- int n_prompt;
- int n_gen;
- int n_batch;
- int n_ubatch;
- ggml_type type_k;
- ggml_type type_v;
- int n_threads;
- std::string cpu_mask;
- bool cpu_strict;
- int poll;
- int n_gpu_layers;
- std::string rpc_servers;
- llama_split_mode split_mode;
- int main_gpu;
- bool no_kv_offload;
- bool flash_attn;
+ std::string model;
+ int n_prompt;
+ int n_gen;
+ int n_batch;
+ int n_ubatch;
+ ggml_type type_k;
+ ggml_type type_v;
+ int n_threads;
+ std::string cpu_mask;
+ bool cpu_strict;
+ int poll;
+ int n_gpu_layers;
+ std::string rpc_servers;
+ llama_split_mode split_mode;
+ int main_gpu;
+ bool no_kv_offload;
+ bool flash_attn;
  std::vector<float> tensor_split;
- bool use_mmap;
- bool embeddings;
+ bool use_mmap;
+ bool embeddings;

  llama_model_params to_llama_mparams() const {
  llama_model_params mparams = llama_model_default_params();
@@ -624,35 +699,31 @@ struct cmd_params_instance {
  if (!rpc_servers.empty()) {
  mparams.rpc_servers = rpc_servers.c_str();
  }
- mparams.split_mode = split_mode;
- mparams.main_gpu = main_gpu;
+ mparams.split_mode = split_mode;
+ mparams.main_gpu = main_gpu;
  mparams.tensor_split = tensor_split.data();
- mparams.use_mmap = use_mmap;
+ mparams.use_mmap = use_mmap;

  return mparams;
  }

  bool equal_mparams(const cmd_params_instance & other) const {
- return model == other.model &&
- n_gpu_layers == other.n_gpu_layers &&
- rpc_servers == other.rpc_servers &&
- split_mode == other.split_mode &&
- main_gpu == other.main_gpu &&
- use_mmap == other.use_mmap &&
+ return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
+ split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
  tensor_split == other.tensor_split;
  }

  llama_context_params to_llama_cparams() const {
  llama_context_params cparams = llama_context_default_params();

- cparams.n_ctx = n_prompt + n_gen;
- cparams.n_batch = n_batch;
- cparams.n_ubatch = n_ubatch;
- cparams.type_k = type_k;
- cparams.type_v = type_v;
+ cparams.n_ctx = n_prompt + n_gen;
+ cparams.n_batch = n_batch;
+ cparams.n_ubatch = n_ubatch;
+ cparams.type_k = type_k;
+ cparams.type_v = type_v;
  cparams.offload_kqv = !no_kv_offload;
- cparams.flash_attn = flash_attn;
- cparams.embeddings = embeddings;
+ cparams.flash_attn = flash_attn;
+ cparams.embeddings = embeddings;

  return cparams;
  }
@@ -662,6 +733,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
  std::vector<cmd_params_instance> instances;

  // this ordering minimizes the number of times that each model needs to be reloaded
+ // clang-format off
  for (const auto & m : params.model)
  for (const auto & nl : params.n_gpu_layers)
  for (const auto & rpc : params.rpc_servers)
@@ -767,100 +839,94 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
  instances.push_back(instance);
  }
  }
+ // clang-format on

  return instances;
  }

  struct test {
  static const std::string build_commit;
- static const int build_number;
+ static const int build_number;
  static const std::string cpu_info;
  static const std::string gpu_info;
- std::string model_filename;
- std::string model_type;
- uint64_t model_size;
- uint64_t model_n_params;
- int n_batch;
- int n_ubatch;
- int n_threads;
- std::string cpu_mask;
- bool cpu_strict;
- int poll;
- ggml_type type_k;
- ggml_type type_v;
- int n_gpu_layers;
- llama_split_mode split_mode;
- int main_gpu;
- bool no_kv_offload;
- bool flash_attn;
- std::vector<float> tensor_split;
- bool use_mmap;
- bool embeddings;
- int n_prompt;
- int n_gen;
- std::string test_time;
- std::vector<uint64_t> samples_ns;
+ std::string model_filename;
+ std::string model_type;
+ uint64_t model_size;
+ uint64_t model_n_params;
+ int n_batch;
+ int n_ubatch;
+ int n_threads;
+ std::string cpu_mask;
+ bool cpu_strict;
+ int poll;
+ ggml_type type_k;
+ ggml_type type_v;
+ int n_gpu_layers;
+ llama_split_mode split_mode;
+ int main_gpu;
+ bool no_kv_offload;
+ bool flash_attn;
+ std::vector<float> tensor_split;
+ bool use_mmap;
+ bool embeddings;
+ int n_prompt;
+ int n_gen;
+ std::string test_time;
+ std::vector<uint64_t> samples_ns;

  test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
  model_filename = inst.model;
  char buf[128];
  llama_model_desc(lmodel, buf, sizeof(buf));
- model_type = buf;
- model_size = llama_model_size(lmodel);
+ model_type = buf;
+ model_size = llama_model_size(lmodel);
  model_n_params = llama_model_n_params(lmodel);
- n_batch = inst.n_batch;
- n_ubatch = inst.n_ubatch;
- n_threads = inst.n_threads;
- cpu_mask = inst.cpu_mask;
- cpu_strict = inst.cpu_strict;
- poll = inst.poll;
- type_k = inst.type_k;
- type_v = inst.type_v;
- n_gpu_layers = inst.n_gpu_layers;
- split_mode = inst.split_mode;
- main_gpu = inst.main_gpu;
- no_kv_offload = inst.no_kv_offload;
- flash_attn = inst.flash_attn;
- tensor_split = inst.tensor_split;
- use_mmap = inst.use_mmap;
- embeddings = inst.embeddings;
- n_prompt = inst.n_prompt;
- n_gen = inst.n_gen;
+ n_batch = inst.n_batch;
+ n_ubatch = inst.n_ubatch;
+ n_threads = inst.n_threads;
+ cpu_mask = inst.cpu_mask;
+ cpu_strict = inst.cpu_strict;
+ poll = inst.poll;
+ type_k = inst.type_k;
+ type_v = inst.type_v;
+ n_gpu_layers = inst.n_gpu_layers;
+ split_mode = inst.split_mode;
+ main_gpu = inst.main_gpu;
+ no_kv_offload = inst.no_kv_offload;
+ flash_attn = inst.flash_attn;
+ tensor_split = inst.tensor_split;
+ use_mmap = inst.use_mmap;
+ embeddings = inst.embeddings;
+ n_prompt = inst.n_prompt;
+ n_gen = inst.n_gen;
  // RFC 3339 date-time format
- time_t t = time(NULL);
+ time_t t = time(NULL);
  std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
  test_time = buf;

  (void) ctx;
  }

- uint64_t avg_ns() const {
- return ::avg(samples_ns);
- }
+ uint64_t avg_ns() const { return ::avg(samples_ns); }

- uint64_t stdev_ns() const {
- return ::stdev(samples_ns);
- }
+ uint64_t stdev_ns() const { return ::stdev(samples_ns); }

  std::vector<double> get_ts() const {
- int n_tokens = n_prompt + n_gen;
+ int n_tokens = n_prompt + n_gen;
  std::vector<double> ts;
- std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
+ std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
+ [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
  return ts;
  }

- double avg_ts() const {
- return ::avg(get_ts());
- }
+ double avg_ts() const { return ::avg(get_ts()); }

- double stdev_ts() const {
- return ::stdev(get_ts());
- }
+ double stdev_ts() const { return ::stdev(get_ts()); }

  static std::string get_backend() {
  std::vector<std::string> backends;
  for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
- auto * reg = ggml_backend_reg_get(i);
+ auto * reg = ggml_backend_reg_get(i);
  std::string name = ggml_backend_reg_name(reg);
  if (name != "CPU") {
  backends.push_back(ggml_backend_reg_name(reg));
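get_ts() in the hunk above converts each timed repetition into a throughput sample: with n_tokens = n_prompt + n_gen and t the measured duration in nanoseconds, the rate is 1e9 * n_tokens / t tokens per second. For example, a run that processes 512 prompt tokens in 8.0e8 ns scores 1e9 × 512 / 8.0e8 = 640 t/s; avg_ts() and stdev_ts() then summarize those samples across repetitions.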
@@ -871,36 +937,27 @@ struct test {

  static const std::vector<std::string> & get_fields() {
  static const std::vector<std::string> fields = {
- "build_commit", "build_number",
- "cpu_info", "gpu_info", "backends",
- "model_filename", "model_type", "model_size", "model_n_params",
- "n_batch", "n_ubatch",
- "n_threads", "cpu_mask", "cpu_strict", "poll",
- "type_k", "type_v",
- "n_gpu_layers", "split_mode",
- "main_gpu", "no_kv_offload", "flash_attn",
- "tensor_split", "use_mmap", "embeddings",
- "n_prompt", "n_gen", "test_time",
- "avg_ns", "stddev_ns",
- "avg_ts", "stddev_ts",
+ "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
+ "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
+ "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
+ "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
+ "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
+ "avg_ts", "stddev_ts",
  };
  return fields;
  }

- enum field_type {STRING, BOOL, INT, FLOAT};
+ enum field_type { STRING, BOOL, INT, FLOAT };

  static field_type get_field_type(const std::string & field) {
- if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
- field == "n_threads" || field == "poll" ||
- field == "model_size" || field == "model_n_params" ||
- field == "n_gpu_layers" || field == "main_gpu" ||
- field == "n_prompt" || field == "n_gen" ||
- field == "avg_ns" || field == "stddev_ns") {
+ if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
+ field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
+ field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
+ field == "stddev_ns") {
  return INT;
  }
- if (field == "f16_kv" || field == "no_kv_offload" ||
- field == "cpu_strict" ||
- field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+ if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
+ field == "use_mmap" || field == "embeddings") {
  return BOOL;
  }
  if (field == "avg_ts" || field == "stddev_ts") {
@@ -911,7 +968,7 @@ struct test {

  std::vector<std::string> get_values() const {
  std::string tensor_split_str;
- int max_nonzero = 0;
+ int max_nonzero = 0;
  for (size_t i = 0; i < llama_max_devices(); i++) {
  if (tensor_split[i] > 0) {
  max_nonzero = i;
@@ -925,29 +982,47 @@ struct test {
  tensor_split_str += "/";
  }
  }
- std::vector<std::string> values = {
- build_commit, std::to_string(build_number),
- cpu_info, gpu_info, get_backend(),
- model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
- std::to_string(n_batch), std::to_string(n_ubatch),
- std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
- ggml_type_name(type_k), ggml_type_name(type_v),
- std::to_string(n_gpu_layers), split_mode_str(split_mode),
- std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
- tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
- std::to_string(n_prompt), std::to_string(n_gen), test_time,
- std::to_string(avg_ns()), std::to_string(stdev_ns()),
- std::to_string(avg_ts()), std::to_string(stdev_ts())
- };
+ std::vector<std::string> values = { build_commit,
+ std::to_string(build_number),
+ cpu_info,
+ gpu_info,
+ get_backend(),
+ model_filename,
+ model_type,
+ std::to_string(model_size),
+ std::to_string(model_n_params),
+ std::to_string(n_batch),
+ std::to_string(n_ubatch),
+ std::to_string(n_threads),
+ cpu_mask,
+ std::to_string(cpu_strict),
+ std::to_string(poll),
+ ggml_type_name(type_k),
+ ggml_type_name(type_v),
+ std::to_string(n_gpu_layers),
+ split_mode_str(split_mode),
+ std::to_string(main_gpu),
+ std::to_string(no_kv_offload),
+ std::to_string(flash_attn),
+ tensor_split_str,
+ std::to_string(use_mmap),
+ std::to_string(embeddings),
+ std::to_string(n_prompt),
+ std::to_string(n_gen),
+ test_time,
+ std::to_string(avg_ns()),
+ std::to_string(stdev_ns()),
+ std::to_string(avg_ts()),
+ std::to_string(stdev_ts()) };
  return values;
  }

  std::map<std::string, std::string> get_map() const {
  std::map<std::string, std::string> map;
- auto fields = get_fields();
- auto values = get_values();
- std::transform(fields.begin(), fields.end(), values.begin(),
- std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
+ auto fields = get_fields();
+ auto values = get_values();
+ std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
+ std::make_pair<const std::string &, const std::string &>);
  return map;
  }
  };
@@ -961,9 +1036,12 @@ struct printer {
  virtual ~printer() {}

  FILE * fout;
+
  virtual void print_header(const cmd_params & params) { (void) params; }
+
  virtual void print_test(const test & t) = 0;
- virtual void print_footer() { }
+
+ virtual void print_footer() {}
  };

  struct csv_printer : public printer {
@@ -979,7 +1057,7 @@ struct csv_printer : public printer {
  return escaped;
  }

- void print_header(const cmd_params & params) override {
+ void print_header(const cmd_params & params) override {
  std::vector<std::string> fields = test::get_fields();
  fprintf(fout, "%s\n", join(fields, ",").c_str());
  (void) params;
@@ -992,7 +1070,6 @@ struct csv_printer : public printer {
  }
  };

-
  static std::string escape_json(const std::string & value) {
  std::string escaped;
  for (auto c : value) {
@@ -1000,7 +1077,7 @@ static std::string escape_json(const std::string & value) {
  escaped += "\\\"";
  } else if (c == '\\') {
  escaped += "\\\\";
- } else if (c <= 0x1f) {
+ } else if (c <= 0x1f) {
  char buf[8];
  snprintf(buf, sizeof(buf), "\\u%04x", c);
  escaped += buf;
@@ -1033,7 +1110,8 @@ struct json_printer : public printer {
  void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
  assert(fields.size() == values.size());
  for (size_t i = 0; i < fields.size(); i++) {
- fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
+ fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(),
+ format_json_value(fields.at(i), values.at(i)).c_str());
  }
  }

@@ -1051,12 +1129,9 @@ struct json_printer : public printer {
  fflush(fout);
  }

- void print_footer() override {
- fprintf(fout, "\n]\n");
- }
+ void print_footer() override { fprintf(fout, "\n]\n"); }
  };

-
  struct jsonl_printer : public printer {
  void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
  assert(fields.size() == values.size());
@@ -1116,7 +1191,7 @@ struct markdown_printer : public printer {
  return 13;
  }

- int width = std::max((int)field.length(), 10);
+ int width = std::max((int) field.length(), 10);

  if (test::get_field_type(field) == test::STRING) {
  return -width;
@@ -1230,18 +1305,18 @@ struct markdown_printer : public printer {
  fprintf(fout, "|");
  for (const auto & field : fields) {
  std::string value;
- char buf[128];
+ char buf[128];
  if (field == "model") {
  value = t.model_type;
  } else if (field == "size") {
- if (t.model_size < 1024*1024*1024) {
+ if (t.model_size < 1024 * 1024 * 1024) {
  snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
  } else {
  snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
  }
  value = buf;
  } else if (field == "params") {
- if (t.model_n_params < 1000*1000*1000) {
+ if (t.model_n_params < 1000 * 1000 * 1000) {
  snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
  } else {
  snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
@@ -1303,7 +1378,8 @@ struct sql_printer : public printer {
  std::vector<std::string> fields = test::get_fields();
  fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
  for (size_t i = 0; i < fields.size(); i++) {
- fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : "");
+ fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
+ i < fields.size() - 1 ? "," : "");
  }
  fprintf(fout, ");\n");
  fprintf(fout, "\n");
@@ -1324,8 +1400,8 @@
  static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
  llama_set_n_threads(ctx, n_threads, n_threads);

- const llama_model * model = llama_get_model(ctx);
- const int32_t n_vocab = llama_n_vocab(model);
+ const llama_model * model = llama_get_model(ctx);
+ const int32_t n_vocab = llama_n_vocab(model);

  std::vector<llama_token> tokens(n_batch);

@@ -1333,7 +1409,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th

  while (n_processed < n_prompt) {
  int n_tokens = std::min(n_prompt - n_processed, n_batch);
- tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+ tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
  for (int i = 1; i < n_tokens; i++) {
  tokens[i] = std::rand() % n_vocab;
  }
@@ -1347,8 +1423,8 @@
  static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
  llama_set_n_threads(ctx, n_threads, n_threads);

- const llama_model * model = llama_get_model(ctx);
- const int32_t n_vocab = llama_n_vocab(model);
+ const llama_model * model = llama_get_model(ctx);
+ const int32_t n_vocab = llama_n_vocab(model);

  llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;

@@ -1401,6 +1477,17 @@ int main(int argc, char ** argv) {

  cmd_params params = parse_cmd_params(argc, argv);

+ // initialize backends
+ ggml_backend_load_all();
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+ return 1;
+ }
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+ auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
  // initialize llama.cpp
  if (!params.verbose) {
  llama_log_set(llama_null_log_callback, NULL);
@@ -1411,7 +1498,7 @@ int main(int argc, char ** argv) {
  set_process_priority(params.prio);

  // initialize printer
- std::unique_ptr<printer> p = create_printer(params.output_format);
+ std::unique_ptr<printer> p = create_printer(params.output_format);
  std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);

  if (p) {
@@ -1426,15 +1513,15 @@ int main(int argc, char ** argv) {

  std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

- llama_model * lmodel = nullptr;
+ llama_model * lmodel = nullptr;
  const cmd_params_instance * prev_inst = nullptr;

- int params_idx = 0;
+ int params_idx = 0;
  auto params_count = params_instances.size();
  for (const auto & inst : params_instances) {
- params_idx ++;
+ params_idx++;
  if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
  }
  // keep the same model between tests when possible
  if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
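The check above is what makes the `// clang-format off` loop nest earlier in the diff pay off: because the model-level parameters vary in the outermost loops, consecutive instances that agree on equal_mparams() reuse the already-loaded model instead of reloading it. A reduced sketch of that caching pattern, with hypothetical stand-in types (Model, Instance) in place of the llama.cpp ones:

    #include <string>
    #include <vector>

    // hypothetical stand-ins for llama_model / cmd_params_instance
    struct Model { std::string path; };

    struct Instance {
        std::string model;
        int n_gpu_layers;
        bool equal_mparams(const Instance & other) const {
            return model == other.model && n_gpu_layers == other.n_gpu_layers;
        }
    };

    static Model * load_model(const std::string & path) { return new Model{ path }; }
    static void free_model(Model * m) { delete m; }

    static void run_all(const std::vector<Instance> & instances) {
        Model * lmodel = nullptr;
        const Instance * prev_inst = nullptr;
        for (const auto & inst : instances) {
            // reload only when the model-level parameters actually changed
            if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
                if (lmodel) { free_model(lmodel); }
                lmodel = load_model(inst.model);
                prev_inst = &inst;
            }
            // ... create a context and run the benchmark for this instance ...
        }
        if (lmodel) { free_model(lmodel); }
    }

    int main() {
        // the first two instances share model parameters, so only two loads happen
        run_all({ { "7B.gguf", 99 }, { "7B.gguf", 99 }, { "13B.gguf", 99 } });
        return 0;
    }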
@@ -1475,7 +1562,7 @@ int main(int argc, char ** argv) {
  tpp.poll = t.poll;
  tpp.prio = params.prio;

- struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+ struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
  if (!threadpool) {
  fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
  exit(1);
@@ -1486,14 +1573,14 @@ int main(int argc, char ** argv) {
  // warmup run
  if (t.n_prompt > 0) {
  if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
  }
  //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
  test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
  }
  if (t.n_gen > 0) {
  if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
  }
  test_gen(ctx, 1, t.n_threads);
  }
@@ -1505,13 +1592,15 @@ int main(int argc, char ** argv) {

  if (t.n_prompt > 0) {
  if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
+ i + 1, params.reps);
  }
  test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
  }
  if (t.n_gen > 0) {
  if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
+ i + 1, params.reps);
  }
  test_gen(ctx, t.n_gen, t.n_threads);
  }
@@ -1534,7 +1623,7 @@ int main(int argc, char ** argv) {

  llama_free(ctx);

- ggml_threadpool_free(threadpool);
+ ggml_threadpool_free_fn(threadpool);
  }

  llama_free_model(lmodel);
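The most consequential change to this file is the block added to main() above: llama-bench now calls ggml_backend_load_all() and resolves the threadpool entry points through the backend registry (ggml_backend_reg_get_proc_address) rather than calling ggml_threadpool_new/ggml_threadpool_free directly, since the CPU backend may now live in a dynamically loaded module. A condensed sketch of that lookup pattern; the header names and ggml_threadpool_params_default are assumed from the surrounding llama.cpp code rather than shown in this hunk:

    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-cpu.h"  // assumed: declares the ggml_threadpool_* prototypes

    int main() {
        // discover and load all available backend modules
        ggml_backend_load_all();

        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!cpu_dev) {
            fprintf(stderr, "CPU backend is not loaded\n");
            return 1;
        }

        // resolve the threadpool functions from whichever module provides the CPU backend
        ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto * tp_new  = (decltype(ggml_threadpool_new) *)  ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
        auto * tp_free = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
        if (!tp_new || !tp_free) {
            fprintf(stderr, "threadpool functions not exposed by the CPU backend\n");
            return 1;
        }

        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(4);  // 4 threads
        struct ggml_threadpool * threadpool = tp_new(&tpp);
        // ... hand the threadpool to the compute context and run work ...
        tp_free(threadpool);
        return 0;
    }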