@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
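The bulk of the churn above is the vendored llama.cpp tree moving each compute backend (CPU, BLAS, CUDA, Vulkan, SYCL, CANN, Kompute, RPC, ...) into its own ggml/src/ggml-* subdirectory and registering it through the new ggml-backend-reg.cpp. The llama-bench diff below shows the user-visible side of that refactor: hardware is discovered through the backend registry at runtime rather than through per-backend #ifdef blocks. A minimal sketch of that enumeration, using only the ggml-backend.h calls that appear verbatim in the diff (this program is not part of the package):

    // List every device the ggml backend registry knows about --
    // the same API that get_cpu_info()/get_gpu_info() use in the diff below.
    #include <cstdio>
    #include "ggml-backend.h"

    int main() {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            const char * kind = "GPU";
            switch (ggml_backend_dev_type(dev)) {
                case GGML_BACKEND_DEVICE_TYPE_CPU:   kind = "CPU";   break;
                case GGML_BACKEND_DEVICE_TYPE_ACCEL: kind = "ACCEL"; break;
                default:                             break; // GPU device
            }
            printf("%-5s %s\n", kind, ggml_backend_dev_description(dev));
        }
        return 0;
    }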
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp
@@ -6,34 +6,28 @@
  #include <clocale>
  #include <cmath>
  #include <cstdio>
+ #include <cstdlib>
  #include <cstring>
  #include <ctime>
- #include <cstdlib>
  #include <iterator>
  #include <map>
  #include <numeric>
  #include <regex>
  #include <sstream>
  #include <string>
- #include <vector>
  #include <thread>
+ #include <vector>

+ #include "common.h"
  #include "ggml.h"
  #include "llama.h"
- #include "common.h"
- #include "ggml-cuda.h"
- #include "ggml-sycl.h"
-
- #ifdef GGML_USE_CANN
- #include "ggml-cann.h"
- #endif

  #ifdef _WIN32
- #define WIN32_LEAN_AND_MEAN
- #ifndef NOMINMAX
- #   define NOMINMAX
- #endif
- #include <windows.h>
+ #    define WIN32_LEAN_AND_MEAN
+ #    ifndef NOMINMAX
+ #        define NOMINMAX
+ #    endif
+ #    include <windows.h>
  #endif

  // utils
@@ -42,8 +36,7 @@ static uint64_t get_time_ns() {
  return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
  }

- template<class T>
- static std::string join(const std::vector<T> & values, const std::string & delim) {
+ template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
  std::ostringstream str;
  for (size_t i = 0; i < values.size(); i++) {
  str << values[i];
@@ -54,137 +47,73 @@ static std::string join(const std::vector<T> & values, const std::string & delim
  return str.str();
  }

- template<typename T, typename F>
- static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
+ template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
  std::vector<std::string> str_values;
  std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
  return str_values;
  }

- template<typename T>
- static T avg(const std::vector<T> & v) {
+ template <typename T> static T avg(const std::vector<T> & v) {
  if (v.empty()) {
  return 0;
  }
  T sum = std::accumulate(v.begin(), v.end(), T(0));
- return sum / (T)v.size();
+ return sum / (T) v.size();
  }

- template<typename T>
- static T stdev(const std::vector<T> & v) {
+ template <typename T> static T stdev(const std::vector<T> & v) {
  if (v.size() <= 1) {
  return 0;
  }
- T mean = avg(v);
+ T mean = avg(v);
  T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
- T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
+ T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1));
  return stdev;
  }

  static std::string get_cpu_info() {
- std::string id;
- #ifdef __linux__
- FILE * f = fopen("/proc/cpuinfo", "r");
- if (f) {
- char buf[1024];
- while (fgets(buf, sizeof(buf), f)) {
- if (strncmp(buf, "model name", 10) == 0) {
- char * p = strchr(buf, ':');
- if (p) {
- p++;
- while (std::isspace(*p)) {
- p++;
- }
- while (std::isspace(p[strlen(p) - 1])) {
- p[strlen(p) - 1] = '\0';
- }
- id = p;
- break;
- }
- }
+ std::vector<std::string> cpu_list;
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+ auto * dev = ggml_backend_dev_get(i);
+ auto dev_type = ggml_backend_dev_type(dev);
+ if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+ cpu_list.push_back(ggml_backend_dev_description(dev));
  }
- fclose(f);
- }
- #elif defined(_WIN32)
- HKEY hKey;
- if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
- TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
- 0,
- KEY_READ,
- &hKey) != ERROR_SUCCESS) {
- // fail to open registry key
- return "";
- }
- char cpu_brand[256];
- DWORD cpu_brand_size = sizeof(cpu_brand);
- if (RegQueryValueExA(hKey,
- TEXT("ProcessorNameString"),
- NULL,
- NULL,
- (LPBYTE)cpu_brand,
- &cpu_brand_size) == ERROR_SUCCESS) {
- id.assign(cpu_brand, cpu_brand_size);
- if (id.find('\0') != std::string::npos) {
- id.resize(id.find('\0'));
- }
- }
- RegCloseKey(hKey);
- #endif
- // TODO: other platforms
- return id;
+ }
+ return join(cpu_list, ", ");
  }

  static std::string get_gpu_info() {
- std::string id;
- #ifdef GGML_USE_CUDA
- int count = ggml_backend_cuda_get_device_count();
- for (int i = 0; i < count; i++) {
- char buf[128];
- ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
- id += buf;
- if (i < count - 1) {
- id += "/";
- }
- }
- #endif
- #ifdef GGML_USE_SYCL
- int count = ggml_backend_sycl_get_device_count();
- for (int i = 0; i < count; i++) {
- char buf[128];
- ggml_sycl_get_device_description(i, buf, sizeof(buf));
- id += buf;
- if (i < count - 1) {
- id += "/";
- }
- }
- #endif
- #ifdef GGML_USE_CANN
- uint32_t count = ggml_backend_cann_get_device_count();
- for (uint32_t i = 0; i < count; i++) {
- char buf[128];
- ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
- id += buf;
- if (i < count - 1) {
- id += "/";
+ std::vector<std::string> gpu_list;
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+ auto * dev = ggml_backend_dev_get(i);
+ auto dev_type = ggml_backend_dev_type(dev);
+ if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+ gpu_list.push_back(ggml_backend_dev_description(dev));
  }
  }
- #endif
- // TODO: other backends
- return id;
+ return join(gpu_list, ", ");
  }

  // command line params
- enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
+ enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };

  static const char * output_format_str(output_formats format) {
  switch (format) {
- case NONE: return "none";
- case CSV: return "csv";
- case JSON: return "json";
- case JSONL: return "jsonl";
- case MARKDOWN: return "md";
- case SQL: return "sql";
- default: GGML_ABORT("invalid output format");
+ case NONE:
+ return "none";
+ case CSV:
+ return "csv";
+ case JSON:
+ return "json";
+ case JSONL:
+ return "jsonl";
+ case MARKDOWN:
+ return "md";
+ case SQL:
+ return "sql";
+ default:
+ GGML_ABORT("invalid output format");
  }
  }

@@ -209,10 +138,14 @@ static bool output_format_from_str(const std::string & s, output_formats & forma

  static const char * split_mode_str(llama_split_mode mode) {
  switch (mode) {
- case LLAMA_SPLIT_MODE_NONE: return "none";
- case LLAMA_SPLIT_MODE_LAYER: return "layer";
- case LLAMA_SPLIT_MODE_ROW: return "row";
- default: GGML_ABORT("invalid split mode");
+ case LLAMA_SPLIT_MODE_NONE:
+ return "none";
+ case LLAMA_SPLIT_MODE_LAYER:
+ return "layer";
+ case LLAMA_SPLIT_MODE_ROW:
+ return "row";
+ default:
+ GGML_ABORT("invalid split mode");
  }
  }

@@ -223,59 +156,59 @@ static std::string pair_str(const std::pair<int, int> & p) {
  }

  struct cmd_params {
- std::vector<std::string> model;
- std::vector<int> n_prompt;
- std::vector<int> n_gen;
+ std::vector<std::string> model;
+ std::vector<int> n_prompt;
+ std::vector<int> n_gen;
  std::vector<std::pair<int, int>> n_pg;
- std::vector<int> n_batch;
- std::vector<int> n_ubatch;
- std::vector<ggml_type> type_k;
- std::vector<ggml_type> type_v;
- std::vector<int> n_threads;
- std::vector<std::string> cpu_mask;
- std::vector<bool> cpu_strict;
- std::vector<int> poll;
- std::vector<int> n_gpu_layers;
- std::vector<std::string> rpc_servers;
- std::vector<llama_split_mode> split_mode;
- std::vector<int> main_gpu;
- std::vector<bool> no_kv_offload;
- std::vector<bool> flash_attn;
- std::vector<std::vector<float>> tensor_split;
- std::vector<bool> use_mmap;
- std::vector<bool> embeddings;
- ggml_numa_strategy numa;
- int reps;
- ggml_sched_priority prio;
- int delay;
- bool verbose;
- bool progress;
- output_formats output_format;
- output_formats output_format_stderr;
+ std::vector<int> n_batch;
+ std::vector<int> n_ubatch;
+ std::vector<ggml_type> type_k;
+ std::vector<ggml_type> type_v;
+ std::vector<int> n_threads;
+ std::vector<std::string> cpu_mask;
+ std::vector<bool> cpu_strict;
+ std::vector<int> poll;
+ std::vector<int> n_gpu_layers;
+ std::vector<std::string> rpc_servers;
+ std::vector<llama_split_mode> split_mode;
+ std::vector<int> main_gpu;
+ std::vector<bool> no_kv_offload;
+ std::vector<bool> flash_attn;
+ std::vector<std::vector<float>> tensor_split;
+ std::vector<bool> use_mmap;
+ std::vector<bool> embeddings;
+ ggml_numa_strategy numa;
+ int reps;
+ ggml_sched_priority prio;
+ int delay;
+ bool verbose;
+ bool progress;
+ output_formats output_format;
+ output_formats output_format_stderr;
  };

  static const cmd_params cmd_params_defaults = {
- /* model */ {"models/7B/ggml-model-q4_0.gguf"},
- /* n_prompt */ {512},
- /* n_gen */ {128},
+ /* model */ { "models/7B/ggml-model-q4_0.gguf" },
+ /* n_prompt */ { 512 },
+ /* n_gen */ { 128 },
  /* n_pg */ {},
- /* n_batch */ {2048},
- /* n_ubatch */ {512},
- /* type_k */ {GGML_TYPE_F16},
- /* type_v */ {GGML_TYPE_F16},
- /* n_threads */ {cpu_get_num_math()},
- /* cpu_mask */ {"0x0"},
- /* cpu_strict */ {false},
- /* poll */ {50},
- /* n_gpu_layers */ {99},
- /* rpc_servers */ {""},
- /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
- /* main_gpu */ {0},
- /* no_kv_offload */ {false},
- /* flash_attn */ {false},
- /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
- /* use_mmap */ {true},
- /* embeddings */ {false},
+ /* n_batch */ { 2048 },
+ /* n_ubatch */ { 512 },
+ /* type_k */ { GGML_TYPE_F16 },
+ /* type_v */ { GGML_TYPE_F16 },
+ /* n_threads */ { cpu_get_num_math() },
+ /* cpu_mask */ { "0x0" },
+ /* cpu_strict */ { false },
+ /* poll */ { 50 },
+ /* n_gpu_layers */ { 99 },
+ /* rpc_servers */ { "" },
+ /* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
+ /* main_gpu */ { 0 },
+ /* no_kv_offload */ { false },
+ /* flash_attn */ { false },
+ /* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
+ /* use_mmap */ { true },
+ /* embeddings */ { false },
  /* numa */ GGML_NUMA_STRATEGY_DISABLED,
  /* reps */ 5,
  /* prio */ GGML_SCHED_PRIO_NORMAL,
@@ -292,44 +225,68 @@ static void print_usage(int /* argc */, char ** argv) {
  printf("options:\n");
  printf(" -h, --help\n");
  printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
- printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+ printf(" -p, --n-prompt <n> (default: %s)\n",
+ join(cmd_params_defaults.n_prompt, ",").c_str());
  printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
- printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
- printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
- printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
- printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
- printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
- printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
- printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
- printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+ printf(" -pg <pp,tg> (default: %s)\n",
+ join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
+ printf(" -b, --batch-size <n> (default: %s)\n",
+ join(cmd_params_defaults.n_batch, ",").c_str());
+ printf(" -ub, --ubatch-size <n> (default: %s)\n",
+ join(cmd_params_defaults.n_ubatch, ",").c_str());
+ printf(" -ctk, --cache-type-k <t> (default: %s)\n",
+ join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+ printf(" -ctv, --cache-type-v <t> (default: %s)\n",
+ join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+ printf(" -t, --threads <n> (default: %s)\n",
+ join(cmd_params_defaults.n_threads, ",").c_str());
+ printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
+ join(cmd_params_defaults.cpu_mask, ",").c_str());
+ printf(" --cpu-strict <0|1> (default: %s)\n",
+ join(cmd_params_defaults.cpu_strict, ",").c_str());
  printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
- printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
- #ifdef GGML_USE_RPC
- printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
- #endif
- printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
- printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
- printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
- printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
- printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+ printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
+ join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+ if (llama_supports_rpc()) {
+ printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
+ join(cmd_params_defaults.rpc_servers, ",").c_str());
+ }
+ printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
+ join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
+ printf(" -mg, --main-gpu <i> (default: %s)\n",
+ join(cmd_params_defaults.main_gpu, ",").c_str());
+ printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
+ join(cmd_params_defaults.no_kv_offload, ",").c_str());
+ printf(" -fa, --flash-attn <0|1> (default: %s)\n",
+ join(cmd_params_defaults.flash_attn, ",").c_str());
+ printf(" -mmp, --mmap <0|1> (default: %s)\n",
+ join(cmd_params_defaults.use_mmap, ",").c_str());
  printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
- printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
+ printf(" -embd, --embeddings <0|1> (default: %s)\n",
+ join(cmd_params_defaults.embeddings, ",").c_str());
  printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
  printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
  printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
  printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
- printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
- printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
+ printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
+ output_format_str(cmd_params_defaults.output_format));
+ printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
+ output_format_str(cmd_params_defaults.output_format_stderr));
  printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
  printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
  printf("\n");
- printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+ printf(
+ "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
+ "multiple times.\n");
  }

  static ggml_type ggml_type_from_name(const std::string & s) {
  if (s == "f16") {
  return GGML_TYPE_F16;
  }
+ if (s == "bf16") {
+ return GGML_TYPE_BF16;
+ }
  if (s == "q8_0") {
  return GGML_TYPE_Q8_0;
  }
@@ -352,22 +309,21 @@ static ggml_type ggml_type_from_name(const std::string & s) {
  return GGML_TYPE_COUNT;
  }

-
  static cmd_params parse_cmd_params(int argc, char ** argv) {
- cmd_params params;
- std::string arg;
- bool invalid_param = false;
- const std::string arg_prefix = "--";
- const char split_delim = ',';
-
- params.verbose = cmd_params_defaults.verbose;
- params.output_format = cmd_params_defaults.output_format;
+ cmd_params params;
+ std::string arg;
+ bool invalid_param = false;
+ const std::string arg_prefix = "--";
+ const char split_delim = ',';
+
+ params.verbose = cmd_params_defaults.verbose;
+ params.output_format = cmd_params_defaults.output_format;
  params.output_format_stderr = cmd_params_defaults.output_format_stderr;
- params.reps = cmd_params_defaults.reps;
- params.numa = cmd_params_defaults.numa;
- params.prio = cmd_params_defaults.prio;
- params.delay = cmd_params_defaults.delay;
- params.progress = cmd_params_defaults.progress;
+ params.reps = cmd_params_defaults.reps;
+ params.numa = cmd_params_defaults.numa;
+ params.prio = cmd_params_defaults.prio;
+ params.delay = cmd_params_defaults.delay;
+ params.progress = cmd_params_defaults.progress;

  for (int i = 1; i < argc; i++) {
  arg = argv[i];
@@ -409,7 +365,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  invalid_param = true;
  break;
  }
- params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
+ params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
  } else if (arg == "-b" || arg == "--batch-size") {
  if (++i >= argc) {
  invalid_param = true;
@@ -429,7 +385,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  invalid_param = true;
  break;
  }
- auto p = string_split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  std::vector<ggml_type> types;
  for (const auto & t : p) {
  ggml_type gt = ggml_type_from_name(t);
@@ -448,7 +404,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  invalid_param = true;
  break;
  }
- auto p = string_split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  std::vector<ggml_type> types;
  for (const auto & t : p) {
  ggml_type gt = ggml_type_from_name(t);
@@ -497,20 +453,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  }
  auto p = string_split<int>(argv[i], split_delim);
  params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
- #ifdef GGML_USE_RPC
- } else if (arg == "-rpc" || arg == "--rpc") {
+ } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
  params.rpc_servers.push_back(argv[i]);
- #endif
  } else if (arg == "-sm" || arg == "--split-mode") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = string_split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  std::vector<llama_split_mode> modes;
  for (const auto & m : p) {
  llama_split_mode mode;
@@ -549,10 +503,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  break;
  } else {
  std::string value(argv[i]);
- /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
- else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
- else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
- else { invalid_param = true; break; }
+ /**/ if (value == "distribute" || value == "") {
+ params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
+ } else if (value == "isolate") {
+ params.numa = GGML_NUMA_STRATEGY_ISOLATE;
+ } else if (value == "numactl") {
+ params.numa = GGML_NUMA_STRATEGY_NUMACTL;
+ } else {
+ invalid_param = true;
+ break;
+ }
  }
  } else if (arg == "-fa" || arg == "--flash-attn") {
  if (++i >= argc) {
@@ -582,9 +542,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  }
  for (auto ts : string_split<std::string>(argv[i], split_delim)) {
  // split string by ; and /
- const std::regex regex{R"([;/]+)"};
- std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
- std::vector<std::string> split_arg{it, {}};
+ const std::regex regex{ R"([;/]+)" };
+ std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
+ std::vector<std::string> split_arg{ it, {} };
  GGML_ASSERT(split_arg.size() <= llama_max_devices());

  std::vector<float> tensor_split(llama_max_devices());
@@ -643,52 +603,94 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
  }

  // set defaults
- if (params.model.empty()) { params.model = cmd_params_defaults.model; }
- if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
- if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
- if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
- if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
- if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
- if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
- if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
- if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
- if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
- if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
- if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
- if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
- if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
- if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
- if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
- if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
- if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
- if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
- if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
- if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
+ if (params.model.empty()) {
+ params.model = cmd_params_defaults.model;
+ }
+ if (params.n_prompt.empty()) {
+ params.n_prompt = cmd_params_defaults.n_prompt;
+ }
+ if (params.n_gen.empty()) {
+ params.n_gen = cmd_params_defaults.n_gen;
+ }
+ if (params.n_pg.empty()) {
+ params.n_pg = cmd_params_defaults.n_pg;
+ }
+ if (params.n_batch.empty()) {
+ params.n_batch = cmd_params_defaults.n_batch;
+ }
+ if (params.n_ubatch.empty()) {
+ params.n_ubatch = cmd_params_defaults.n_ubatch;
+ }
+ if (params.type_k.empty()) {
+ params.type_k = cmd_params_defaults.type_k;
+ }
+ if (params.type_v.empty()) {
+ params.type_v = cmd_params_defaults.type_v;
+ }
+ if (params.n_gpu_layers.empty()) {
+ params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
+ }
+ if (params.rpc_servers.empty()) {
+ params.rpc_servers = cmd_params_defaults.rpc_servers;
+ }
+ if (params.split_mode.empty()) {
+ params.split_mode = cmd_params_defaults.split_mode;
+ }
+ if (params.main_gpu.empty()) {
+ params.main_gpu = cmd_params_defaults.main_gpu;
+ }
+ if (params.no_kv_offload.empty()) {
+ params.no_kv_offload = cmd_params_defaults.no_kv_offload;
+ }
+ if (params.flash_attn.empty()) {
+ params.flash_attn = cmd_params_defaults.flash_attn;
+ }
+ if (params.tensor_split.empty()) {
+ params.tensor_split = cmd_params_defaults.tensor_split;
+ }
+ if (params.use_mmap.empty()) {
+ params.use_mmap = cmd_params_defaults.use_mmap;
+ }
+ if (params.embeddings.empty()) {
+ params.embeddings = cmd_params_defaults.embeddings;
+ }
+ if (params.n_threads.empty()) {
+ params.n_threads = cmd_params_defaults.n_threads;
+ }
+ if (params.cpu_mask.empty()) {
+ params.cpu_mask = cmd_params_defaults.cpu_mask;
+ }
+ if (params.cpu_strict.empty()) {
+ params.cpu_strict = cmd_params_defaults.cpu_strict;
+ }
+ if (params.poll.empty()) {
+ params.poll = cmd_params_defaults.poll;
+ }

  return params;
  }

  struct cmd_params_instance {
- std::string model;
- int n_prompt;
- int n_gen;
- int n_batch;
- int n_ubatch;
- ggml_type type_k;
- ggml_type type_v;
- int n_threads;
- std::string cpu_mask;
- bool cpu_strict;
- int poll;
- int n_gpu_layers;
- std::string rpc_servers;
- llama_split_mode split_mode;
- int main_gpu;
- bool no_kv_offload;
- bool flash_attn;
+ std::string model;
+ int n_prompt;
+ int n_gen;
+ int n_batch;
+ int n_ubatch;
+ ggml_type type_k;
+ ggml_type type_v;
+ int n_threads;
+ std::string cpu_mask;
+ bool cpu_strict;
+ int poll;
+ int n_gpu_layers;
+ std::string rpc_servers;
+ llama_split_mode split_mode;
+ int main_gpu;
+ bool no_kv_offload;
+ bool flash_attn;
  std::vector<float> tensor_split;
- bool use_mmap;
- bool embeddings;
+ bool use_mmap;
+ bool embeddings;

  llama_model_params to_llama_mparams() const {
  llama_model_params mparams = llama_model_default_params();
@@ -697,35 +699,31 @@ struct cmd_params_instance {
  if (!rpc_servers.empty()) {
  mparams.rpc_servers = rpc_servers.c_str();
  }
- mparams.split_mode = split_mode;
- mparams.main_gpu = main_gpu;
+ mparams.split_mode = split_mode;
+ mparams.main_gpu = main_gpu;
  mparams.tensor_split = tensor_split.data();
- mparams.use_mmap = use_mmap;
+ mparams.use_mmap = use_mmap;

  return mparams;
  }

  bool equal_mparams(const cmd_params_instance & other) const {
- return model == other.model &&
- n_gpu_layers == other.n_gpu_layers &&
- rpc_servers == other.rpc_servers &&
- split_mode == other.split_mode &&
- main_gpu == other.main_gpu &&
- use_mmap == other.use_mmap &&
+ return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
+ split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
  tensor_split == other.tensor_split;
  }

  llama_context_params to_llama_cparams() const {
  llama_context_params cparams = llama_context_default_params();

- cparams.n_ctx = n_prompt + n_gen;
- cparams.n_batch = n_batch;
- cparams.n_ubatch = n_ubatch;
- cparams.type_k = type_k;
- cparams.type_v = type_v;
+ cparams.n_ctx = n_prompt + n_gen;
+ cparams.n_batch = n_batch;
+ cparams.n_ubatch = n_ubatch;
+ cparams.type_k = type_k;
+ cparams.type_v = type_v;
  cparams.offload_kqv = !no_kv_offload;
- cparams.flash_attn = flash_attn;
- cparams.embeddings = embeddings;
+ cparams.flash_attn = flash_attn;
+ cparams.embeddings = embeddings;

  return cparams;
  }
@@ -735,6 +733,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
  std::vector<cmd_params_instance> instances;

  // this ordering minimizes the number of times that each model needs to be reloaded
+ // clang-format off
  for (const auto & m : params.model)
  for (const auto & nl : params.n_gpu_layers)
  for (const auto & rpc : params.rpc_servers)
@@ -840,165 +839,125 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
840
839
  instances.push_back(instance);
841
840
  }
842
841
  }
842
+ // clang-format on
843
843
 
844
844
  return instances;
845
845
  }
846
846
 
847
847
  struct test {
848
848
  static const std::string build_commit;
849
- static const int build_number;
850
- static const bool cuda;
851
- static const bool vulkan;
852
- static const bool kompute;
853
- static const bool metal;
854
- static const bool sycl;
855
- static const bool gpu_blas;
856
- static const bool blas;
849
+ static const int build_number;
857
850
  static const std::string cpu_info;
858
851
  static const std::string gpu_info;
859
- std::string model_filename;
860
- std::string model_type;
861
- uint64_t model_size;
862
- uint64_t model_n_params;
863
- int n_batch;
864
- int n_ubatch;
865
- int n_threads;
866
- std::string cpu_mask;
867
- bool cpu_strict;
868
- int poll;
869
- bool has_rpc;
870
- ggml_type type_k;
871
- ggml_type type_v;
872
- int n_gpu_layers;
873
- llama_split_mode split_mode;
874
- int main_gpu;
875
- bool no_kv_offload;
876
- bool flash_attn;
877
- std::vector<float> tensor_split;
878
- bool use_mmap;
879
- bool embeddings;
880
- int n_prompt;
881
- int n_gen;
882
- std::string test_time;
883
- std::vector<uint64_t> samples_ns;
852
+ std::string model_filename;
853
+ std::string model_type;
854
+ uint64_t model_size;
855
+ uint64_t model_n_params;
856
+ int n_batch;
857
+ int n_ubatch;
858
+ int n_threads;
859
+ std::string cpu_mask;
860
+ bool cpu_strict;
861
+ int poll;
862
+ ggml_type type_k;
863
+ ggml_type type_v;
864
+ int n_gpu_layers;
865
+ llama_split_mode split_mode;
866
+ int main_gpu;
867
+ bool no_kv_offload;
868
+ bool flash_attn;
869
+ std::vector<float> tensor_split;
870
+ bool use_mmap;
871
+ bool embeddings;
872
+ int n_prompt;
873
+ int n_gen;
874
+ std::string test_time;
875
+ std::vector<uint64_t> samples_ns;
884
876
 
885
877
  test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
886
878
  model_filename = inst.model;
887
879
  char buf[128];
888
880
  llama_model_desc(lmodel, buf, sizeof(buf));
889
- model_type = buf;
890
- model_size = llama_model_size(lmodel);
881
+ model_type = buf;
882
+ model_size = llama_model_size(lmodel);
891
883
  model_n_params = llama_model_n_params(lmodel);
892
- n_batch = inst.n_batch;
893
- n_ubatch = inst.n_ubatch;
894
- n_threads = inst.n_threads;
895
- cpu_mask = inst.cpu_mask;
896
- cpu_strict = inst.cpu_strict;
897
- poll = inst.poll;
898
- has_rpc = !inst.rpc_servers.empty();
899
- type_k = inst.type_k;
900
- type_v = inst.type_v;
901
- n_gpu_layers = inst.n_gpu_layers;
902
- split_mode = inst.split_mode;
903
- main_gpu = inst.main_gpu;
904
- no_kv_offload = inst.no_kv_offload;
905
- flash_attn = inst.flash_attn;
906
- tensor_split = inst.tensor_split;
907
- use_mmap = inst.use_mmap;
908
- embeddings = inst.embeddings;
909
- n_prompt = inst.n_prompt;
910
- n_gen = inst.n_gen;
884
+ n_batch = inst.n_batch;
885
+ n_ubatch = inst.n_ubatch;
886
+ n_threads = inst.n_threads;
887
+ cpu_mask = inst.cpu_mask;
888
+ cpu_strict = inst.cpu_strict;
889
+ poll = inst.poll;
890
+ type_k = inst.type_k;
891
+ type_v = inst.type_v;
892
+ n_gpu_layers = inst.n_gpu_layers;
893
+ split_mode = inst.split_mode;
894
+ main_gpu = inst.main_gpu;
895
+ no_kv_offload = inst.no_kv_offload;
896
+ flash_attn = inst.flash_attn;
897
+ tensor_split = inst.tensor_split;
898
+ use_mmap = inst.use_mmap;
899
+ embeddings = inst.embeddings;
900
+ n_prompt = inst.n_prompt;
901
+ n_gen = inst.n_gen;
911
902
  // RFC 3339 date-time format
912
- time_t t = time(NULL);
903
+ time_t t = time(NULL);
913
904
  std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
914
905
  test_time = buf;
915
906
 
916
907
  (void) ctx;
917
908
  }
918
909
 
919
- uint64_t avg_ns() const {
920
- return ::avg(samples_ns);
921
- }
910
+ uint64_t avg_ns() const { return ::avg(samples_ns); }
922
911
 
923
- uint64_t stdev_ns() const {
924
- return ::stdev(samples_ns);
925
- }
912
+ uint64_t stdev_ns() const { return ::stdev(samples_ns); }
926
913
 
927
914
  std::vector<double> get_ts() const {
928
- int n_tokens = n_prompt + n_gen;
915
+ int n_tokens = n_prompt + n_gen;
929
916
  std::vector<double> ts;
930
- std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
917
+ std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
918
+ [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
931
919
  return ts;
932
920
  }
933
921
 
934
- double avg_ts() const {
935
- return ::avg(get_ts());
936
- }
922
+ double avg_ts() const { return ::avg(get_ts()); }
937
923
 
938
- double stdev_ts() const {
939
- return ::stdev(get_ts());
940
- }
924
+ double stdev_ts() const { return ::stdev(get_ts()); }
941
925
 
942
926
  static std::string get_backend() {
943
- if (cuda) {
944
- return GGML_CUDA_NAME;
945
- }
946
- if (vulkan) {
947
- return "Vulkan";
948
- }
949
- if (kompute) {
950
- return "Kompute";
951
- }
952
- if (metal) {
953
- return "Metal";
954
- }
955
- if (sycl) {
956
- return GGML_SYCL_NAME;
957
- }
958
- if (gpu_blas) {
959
- return "GPU BLAS";
960
- }
961
- if (blas) {
962
- return "BLAS";
927
+ std::vector<std::string> backends;
928
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
929
+ auto * reg = ggml_backend_reg_get(i);
930
+ std::string name = ggml_backend_reg_name(reg);
931
+ if (name != "CPU") {
932
+ backends.push_back(ggml_backend_reg_name(reg));
933
+ }
963
934
  }
964
-
965
- return "CPU";
935
+ return backends.empty() ? "CPU" : join(backends, ",");
966
936
  }
967
937
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
-            "build_commit", "build_number",
-            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
-            "cpu_info", "gpu_info",
-            "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_ubatch",
-            "n_threads", "cpu_mask", "cpu_strict", "poll",
-            "type_k", "type_v",
-            "n_gpu_layers", "split_mode",
-            "main_gpu", "no_kv_offload", "flash_attn",
-            "tensor_split", "use_mmap", "embeddings",
-            "n_prompt", "n_gen", "test_time",
-            "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts",
+            "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
+            "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
+            "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
+            "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
+            "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
+            "avg_ts", "stddev_ts",
         };
         return fields;
     }
 
-    enum field_type {STRING, BOOL, INT, FLOAT};
+    enum field_type { STRING, BOOL, INT, FLOAT };
 
     static field_type get_field_type(const std::string & field) {
-        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
-            field == "n_threads" || field == "poll" ||
-            field == "model_size" || field == "model_n_params" ||
-            field == "n_gpu_layers" || field == "main_gpu" ||
-            field == "n_prompt" || field == "n_gen" ||
-            field == "avg_ns" || field == "stddev_ns") {
+        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
+            field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
+            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
+            field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "cpu_strict" ||
-            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
+            field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
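get_field_type() is what lets the typed printers further down quote values correctly: every value travels as a string, and the enum says whether it may be emitted bare. A hypothetical helper in the same spirit (format_json_value, which appears later in this diff, is the real one; this reimplementation is only illustrative):

    #include <string>

    enum field_type { STRING, BOOL, INT, FLOAT };

    // illustrative only: numeric fields pass through, bools become true/false,
    // strings get quoted (escaping omitted in this sketch)
    static std::string format_value(field_type type, const std::string & value) {
        switch (type) {
            case BOOL:  return value == "1" ? "true" : "false";
            case INT:
            case FLOAT: return value;
            default:    return "\"" + value + "\"";
        }
    }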
@@ -1009,7 +968,7 @@ struct test {
 
     std::vector<std::string> get_values() const {
         std::string tensor_split_str;
-        int max_nonzero = 0;
+        int         max_nonzero = 0;
         for (size_t i = 0; i < llama_max_devices(); i++) {
             if (tensor_split[i] > 0) {
                 max_nonzero = i;
@@ -1023,44 +982,53 @@ struct test {
                 tensor_split_str += "/";
             }
         }
-        std::vector<std::string> values = {
-            build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
-            cpu_info, gpu_info,
-            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_ubatch),
-            std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
-            ggml_type_name(type_k), ggml_type_name(type_v),
-            std::to_string(n_gpu_layers), split_mode_str(split_mode),
-            std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
-            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
-            std::to_string(n_prompt), std::to_string(n_gen), test_time,
-            std::to_string(avg_ns()), std::to_string(stdev_ns()),
-            std::to_string(avg_ts()), std::to_string(stdev_ts())
-        };
+        std::vector<std::string> values = { build_commit,
+                                            std::to_string(build_number),
+                                            cpu_info,
+                                            gpu_info,
+                                            get_backend(),
+                                            model_filename,
+                                            model_type,
+                                            std::to_string(model_size),
+                                            std::to_string(model_n_params),
+                                            std::to_string(n_batch),
+                                            std::to_string(n_ubatch),
+                                            std::to_string(n_threads),
+                                            cpu_mask,
+                                            std::to_string(cpu_strict),
+                                            std::to_string(poll),
+                                            ggml_type_name(type_k),
+                                            ggml_type_name(type_v),
+                                            std::to_string(n_gpu_layers),
+                                            split_mode_str(split_mode),
+                                            std::to_string(main_gpu),
+                                            std::to_string(no_kv_offload),
+                                            std::to_string(flash_attn),
+                                            tensor_split_str,
+                                            std::to_string(use_mmap),
+                                            std::to_string(embeddings),
+                                            std::to_string(n_prompt),
+                                            std::to_string(n_gen),
+                                            test_time,
+                                            std::to_string(avg_ns()),
+                                            std::to_string(stdev_ns()),
+                                            std::to_string(avg_ts()),
+                                            std::to_string(stdev_ts()) };
         return values;
     }
 
     std::map<std::string, std::string> get_map() const {
         std::map<std::string, std::string> map;
-        auto fields = get_fields();
-        auto values = get_values();
-        std::transform(fields.begin(), fields.end(), values.begin(),
-                       std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
+        auto                               fields = get_fields();
+        auto                               values = get_values();
+        std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
+                       std::make_pair<const std::string &, const std::string &>);
         return map;
     }
 };
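A consequence of the layout above: get_fields() and get_values() are parallel arrays, and get_map() zips them with std::transform into an std::inserter. The same zip in isolation, with toy data:

    #include <algorithm>
    #include <iterator>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
        const std::vector<std::string> fields = { "build_commit", "n_batch" };
        const std::vector<std::string> values = { "abc1234", "2048" };
        std::map<std::string, std::string> map;
        // pairs fields[i] with values[i]; values must be at least as long as fields
        std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
                       std::make_pair<const std::string &, const std::string &>);
        return map.count("n_batch") == 1 ? 0 : 1;
    }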
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
-const bool        test::cuda         = !!ggml_cpu_has_cuda();
-const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
-const bool        test::kompute      = !!ggml_cpu_has_kompute();
-const bool        test::metal        = !!ggml_cpu_has_metal();
-const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
-const bool        test::blas         = !!ggml_cpu_has_blas();
-const bool        test::sycl         = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();
 
@@ -1068,9 +1036,12 @@ struct printer {
     virtual ~printer() {}
 
     FILE * fout;
+
     virtual void print_header(const cmd_params & params) { (void) params; }
+
     virtual void print_test(const test & t) = 0;
-    virtual void print_footer() { }
+
+    virtual void print_footer() {}
 };
 
 struct csv_printer : public printer {
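The printer hunk above is the entire output abstraction of llama-bench: print_header once, print_test per result, print_footer once. To make the contract concrete, here is a hypothetical tsv_printer sketch (not part of the diff; it assumes the join() helper and the test struct from this same file):

    // hypothetical subclass, relying on declarations in llama-bench.cpp
    struct tsv_printer : public printer {
        void print_header(const cmd_params & params) override {
            fprintf(fout, "%s\n", join(test::get_fields(), "\t").c_str());
            (void) params;
        }
        void print_test(const test & t) override {
            fprintf(fout, "%s\n", join(t.get_values(), "\t").c_str());
            fflush(fout);
        }
        // print_footer() keeps the base class's empty default
    };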
@@ -1086,7 +1057,7 @@ struct csv_printer : public printer {
         return escaped;
     }
 
-    void print_header(const cmd_params & params) override {
+    void print_header(const cmd_params & params) override {
         std::vector<std::string> fields = test::get_fields();
         fprintf(fout, "%s\n", join(fields, ",").c_str());
         (void) params;
@@ -1099,7 +1070,6 @@ struct csv_printer : public printer {
     }
 };
 
-
 static std::string escape_json(const std::string & value) {
     std::string escaped;
     for (auto c : value) {
@@ -1107,7 +1077,7 @@ static std::string escape_json(const std::string & value) {
             escaped += "\\\"";
         } else if (c == '\\') {
             escaped += "\\\\";
-        } else if (c <= 0x1f) {
+        } else if (c <= 0x1f) {
             char buf[8];
             snprintf(buf, sizeof(buf), "\\u%04x", c);
             escaped += buf;
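For reference, escape_json covers exactly three cases: double quotes, backslashes, and C0 control bytes (anything <= 0x1f becomes a \u00xx escape). Assuming the function above, a quick illustration:

    // escape_json("say \"hi\"\n") yields: say \"hi\"\u000a
    std::string s = escape_json("say \"hi\"\n");
    printf("%s\n", s.c_str());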
@@ -1140,7 +1110,8 @@ struct json_printer : public printer {
     void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
         assert(fields.size() == values.size());
         for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
+            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(),
+                    format_json_value(fields.at(i), values.at(i)).c_str());
         }
     }
 
@@ -1158,12 +1129,9 @@ struct json_printer : public printer {
         fflush(fout);
     }
 
-    void print_footer() override {
-        fprintf(fout, "\n]\n");
-    }
+    void print_footer() override { fprintf(fout, "\n]\n"); }
 };
 
-
 struct jsonl_printer : public printer {
     void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
         assert(fields.size() == values.size());
@@ -1223,7 +1191,7 @@ struct markdown_printer : public printer {
             return 13;
         }
 
-        int width = std::max((int)field.length(), 10);
+        int width = std::max((int) field.length(), 10);
 
         if (test::get_field_type(field) == test::STRING) {
             return -width;
@@ -1265,7 +1233,8 @@ struct markdown_printer : public printer {
         fields.emplace_back("size");
         fields.emplace_back("params");
         fields.emplace_back("backend");
-        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
+        bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
+                              test::get_backend().find("BLAS") != std::string::npos;
         if (!is_cpu_backend) {
             fields.emplace_back("n_gpu_layers");
         }
@@ -1336,18 +1305,18 @@ struct markdown_printer : public printer {
         fprintf(fout, "|");
         for (const auto & field : fields) {
             std::string value;
-            char buf[128];
+            char        buf[128];
             if (field == "model") {
                 value = t.model_type;
             } else if (field == "size") {
-                if (t.model_size < 1024*1024*1024) {
+                if (t.model_size < 1024 * 1024 * 1024) {
                     snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
                 } else {
                     snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
                 }
                 value = buf;
             } else if (field == "params") {
-                if (t.model_n_params < 1000*1000*1000) {
+                if (t.model_n_params < 1000 * 1000 * 1000) {
                     snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
                 } else {
                     snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
@@ -1355,9 +1324,6 @@ struct markdown_printer : public printer {
                 value = buf;
             } else if (field == "backend") {
                 value = test::get_backend();
-                if (t.has_rpc) {
-                    value += "+RPC";
-                }
             } else if (field == "test") {
                 if (t.n_prompt > 0 && t.n_gen == 0) {
                     snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
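The size and params columns above pick their unit by a fixed threshold: below 1024^3 bytes the model size prints in MiB, otherwise GiB; below 10^9 weights the count prints as millions, otherwise billions. The same logic as a hypothetical standalone helper:

    #include <cstdint>
    #include <cstdio>

    // mirrors the markdown printer's snprintf thresholds above (illustrative)
    static void format_size(uint64_t bytes, char * buf, size_t n) {
        if (bytes < 1024ull * 1024 * 1024) {
            snprintf(buf, n, "%.2f MiB", bytes / 1024.0 / 1024.0);
        } else {
            snprintf(buf, n, "%.2f GiB", bytes / 1024.0 / 1024.0 / 1024.0);
        }
    }

    int main() {
        char buf[64];
        format_size(7ull * 1000 * 1000 * 1000, buf, sizeof(buf));   // 7 GB model file
        printf("%s\n", buf);                                        // prints "6.52 GiB"
        return 0;
    }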
@@ -1412,7 +1378,8 @@ struct sql_printer : public printer {
         std::vector<std::string> fields = test::get_fields();
         fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
         for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : "");
+            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
+                    i < fields.size() - 1 ? "," : "");
         }
         fprintf(fout, ");\n");
         fprintf(fout, "\n");
@@ -1430,11 +1397,11 @@ struct sql_printer : public printer {
     }
 };
 
-static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
-    const llama_model * model = llama_get_model(ctx);
-    const int32_t n_vocab = llama_n_vocab(model);
+    const llama_model * model   = llama_get_model(ctx);
+    const int32_t       n_vocab = llama_n_vocab(model);
 
     std::vector<llama_token> tokens(n_batch);
 
@@ -1442,27 +1409,27 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
-        tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+        tokens[0]    = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
         for (int i = 1; i < n_tokens; i++) {
             tokens[i] = std::rand() % n_vocab;
         }
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
        n_processed += n_tokens;
     }
 
     llama_synchronize(ctx);
 }
 
-static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
-    const llama_model * model = llama_get_model(ctx);
-    const int32_t n_vocab = llama_n_vocab(model);
+    const llama_model * model   = llama_get_model(ctx);
+    const int32_t       n_vocab = llama_n_vocab(model);
 
     llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
 
     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
         llama_synchronize(ctx);
         token = std::rand() % n_vocab;
     }
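The dropped n_past parameters in both benchmark kernels follow from an upstream API change visible in this hunk: llama_batch_get_one no longer takes a starting position or sequence id, the context tracks positions itself. Condensed from test_gen above, assuming a valid ctx, model, n_vocab and n_gen are in scope:

    // decode one token at a time; llama_synchronize keeps the timing honest
    // by forcing each decode to finish before the next begins
    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
    for (int i = 0; i < n_gen; i++) {
        llama_decode(ctx, llama_batch_get_one(&token, 1));   // positions tracked internally
        llama_synchronize(ctx);
        token = std::rand() % n_vocab;   // any token works for a throughput benchmark
    }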
@@ -1510,6 +1477,17 @@ int main(int argc, char ** argv) {
 
     cmd_params params = parse_cmd_params(argc, argv);
 
+    // initialize backends
+    ggml_backend_load_all();
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!cpu_dev) {
+        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+        return 1;
+    }
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
     // initialize llama.cpp
     if (!params.verbose) {
         llama_log_set(llama_null_log_callback, NULL);
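This hunk introduces the runtime-lookup pattern used throughout the new ggml backend API: load all backends, find the CPU device, and resolve the threadpool entry points by name through ggml_backend_reg_get_proc_address, with decltype supplying the exact function-pointer type for the cast. The pattern distilled (same calls as the hunk; error checks trimmed):

    // resolve CPU-backend functions at runtime instead of linking them directly
    ggml_backend_load_all();
    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto * threadpool_new_fn =
        (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
    auto * threadpool_free_fn =
        (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");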
@@ -1520,7 +1498,7 @@ int main(int argc, char ** argv) {
     set_process_priority(params.prio);
 
     // initialize printer
-    std::unique_ptr<printer> p = create_printer(params.output_format);
+    std::unique_ptr<printer> p     = create_printer(params.output_format);
     std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
 
     if (p) {
@@ -1535,15 +1513,15 @@ int main(int argc, char ** argv) {
 
     std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
 
-    llama_model * lmodel = nullptr;
+    llama_model *               lmodel    = nullptr;
     const cmd_params_instance * prev_inst = nullptr;
 
-    int params_idx = 0;
+    int  params_idx   = 0;
     auto params_count = params_instances.size();
     for (const auto & inst : params_instances) {
-        params_idx ++;
+        params_idx++;
         if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
         }
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
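One small correctness fix rides along in this hunk: params_count comes from std::vector::size() and is a size_t, so the printf length modifier must be %zu; %ld assumes long, which differs from size_t on LLP64 platforms such as 64-bit Windows. Minimal illustration:

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int> v(3);
        size_t n = v.size();
        printf("%zu\n", n);   // %zu is the portable conversion for size_t
        return 0;
    }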
@@ -1584,7 +1562,7 @@ int main(int argc, char ** argv) {
         tpp.poll = t.poll;
         tpp.prio = params.prio;
 
-        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
         if (!threadpool) {
             fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
             exit(1);
@@ -1595,16 +1573,16 @@ int main(int argc, char ** argv) {
         // warmup run
         if (t.n_prompt > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
             }
-            test_gen(ctx, 1, 0, t.n_threads);
+            test_gen(ctx, 1, t.n_threads);
         }
 
         for (int i = 0; i < params.reps; i++) {
@@ -1614,15 +1592,17 @@ int main(int argc, char ** argv) {
 
             if (t.n_prompt > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
+                            i + 1, params.reps);
                 }
-                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+                test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
+                            i + 1, params.reps);
                 }
-                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+                test_gen(ctx, t.n_gen, t.n_threads);
             }
 
             uint64_t t_ns = get_time_ns() - t_start;
@@ -1643,7 +1623,7 @@ int main(int argc, char ** argv) {
 
         llama_free(ctx);
 
-        ggml_threadpool_free(threadpool);
+        ggml_threadpool_free_fn(threadpool);
     }
 
     llama_free_model(lmodel);