@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/docs/build.md

@@ -178,22 +178,24 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht
  cmake --build build --config Release
  ```

- The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
+ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
+
+ The following compilation options are also available to tweak performance:

  | Option | Legal values | Default | Description |
  |-------------------------------|------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
- | GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
- | GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
  | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
  | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
  | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
- | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
  | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
  | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |

  ### MUSA

+ This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+
  - Using `make`:
  ```bash
  make GGML_MUSA=1
@@ -205,6 +207,12 @@ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/c
  cmake --build build --config Release
  ```

+ The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+
+ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+
+ Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
  ### hipBLAS

  This provides BLAS acceleration on HIP-supported AMD GPUs.
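The CUDA and MUSA hunks above only document new runtime environment variables; they do not change the build commands. A rough usage sketch, not part of this diff (the binary name, model path, and `-ngl` value are illustrative placeholders):

```bash
# Sketch: pin the run to one GPU and allow spill-over into system RAM on Linux
# when VRAM is exhausted. Paths and the -ngl value are illustrative only.
CUDA_VISIBLE_DEVICES=0 \
GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 \
./build/bin/llama-cli -m ./models/model.gguf -ngl 99 -p "Hello"

# A MUSA build reads MUSA_VISIBLE_DEVICES the same way.
```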
@@ -218,7 +226,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
- cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+ cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
  && cmake --build build --config Release -- -j 16
  ```
  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
@@ -235,7 +243,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
- cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+ cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
  && cmake --build build -- -j 16
  ```

@@ -247,7 +255,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
  ```bash
  set PATH=%HIP_PATH%\bin;%PATH%
- cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+ cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
  cmake --build build
  ```
  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -256,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm

  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
- The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
- | Option | Legal values | Default | Description |
- |------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
- | GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
- | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

  ### Vulkan
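The HIP hunks above rename the build flag to `GGML_HIP` and drop the old CUDA-named tuning table; GPU selection and the unofficial-GPU override remain environment variables. A hedged run-time sketch (binary name and model path are placeholders, not part of this diff):

```bash
# Sketch: run on an officially unsupported RDNA2 card (e.g. gfx1031/gfx1035)
# by overriding the reported arch, as described in the context line above.
HSA_OVERRIDE_GFX_VERSION=10.3.0 \
HIP_VISIBLE_DEVICES=0 \
./build/bin/llama-cli -m ./models/model.gguf -ngl 99
```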
 
@@ -270,9 +271,9 @@ The following compilation options are also available to tweak performance (yes,

  #### w64devkit

- Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
+ Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).

- Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
+ Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.

  Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
  ```sh
@@ -290,6 +291,29 @@ EOF
  ```
  Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.

+ #### Git Bash MINGW64
+
+ Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
+
+ Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`
+
+ Download and install [`CMake`](https://cmake.org/download/) with the default settings
+
+ Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
+
+ Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands
+
+ ```
+ cmake -B build -DGGML_VULKAN=ON
+ cmake --build build --config Release
+ ```
+
+ Now you can load the model in conversation mode using `Vulkan`
+
+ ```
+ build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+ ```
+
  #### MSYS2
  Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
  ```sh
@@ -348,6 +372,37 @@ cmake --build build --config Release
  # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
  ```

+ ### CANN
+ This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
+
+ For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
+
+ Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
+
+ Go to `llama.cpp` directory and build using CMake.
+ ```bash
+ cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
+ cmake --build build --config release
+ ```
+
+ You can test with:
+
+ `./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+
+ If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
+ ```bash
+ llm_load_tensors: CANN buffer size = 13313.00 MiB
+ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
+ ```
+
+ For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
+
  ### Android

  To read documentation for how to build on Android, [click here](./android.md)
+
+ ### Arm CPU optimized mulmat kernels
+
+ Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+
+ To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
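The new Arm section above names the compiler flags and quantization formats but stops short of a full command line. A sketch under those assumptions (paths are placeholders, and the `llama-quantize` invocation follows the usual `examples/quantize` pattern rather than anything shown in this diff):

```bash
# Sketch only: build with the Arm feature flags quoted above,
# then re-quantize an existing GGUF into one of the Arm-optimized formats.
cmake -B build -DCMAKE_C_FLAGS="-march=armv8.2a+i8mm+sve"
cmake --build build --config Release

# Q4_0_4_4 additionally requires GGML_NO_LLAMAFILE=1 (make) or
# -DGGML_LLAMAFILE=OFF (cmake), per the note above.
./build/bin/llama-quantize ./models/model-f16.gguf ./models/model-Q4_0_4_8.gguf Q4_0_4_8
```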
package/src/llama.cpp/examples/CMakeLists.txt

@@ -13,10 +13,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
  if (EMSCRIPTEN)
  else()
  add_subdirectory(cvector-generator)
- add_subdirectory(baby-llama)
  add_subdirectory(batched-bench)
  add_subdirectory(batched)
- add_subdirectory(benchmark)
  add_subdirectory(convert-llama2c-to-ggml)
  add_subdirectory(embedding)
  add_subdirectory(eval-callback)
@@ -50,6 +48,7 @@ else()
  endif()
  add_subdirectory(save-load-state)
  add_subdirectory(simple)
+ add_subdirectory(simple-chat)
  add_subdirectory(speculative)
  add_subdirectory(tokenize)
  endif()
package/src/llama.cpp/examples/batched/batched.cpp

@@ -1,31 +1,30 @@
+ #include "arg.h"
  #include "common.h"
+ #include "log.h"
  #include "llama.h"

  #include <algorithm>
- #include <cmath>
  #include <cstdio>
  #include <string>
  #include <vector>

- static void print_usage(int argc, char ** argv, const gpt_params & params) {
- gpt_params_print_usage(argc, argv, params);
-
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
- LOG_TEE("\n");
+ static void print_usage(int, char ** argv) {
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+ LOG("\n");
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

  params.prompt = "Hello my name is";
  params.n_predict = 32;

- if (!gpt_params_parse(argc, argv, params)) {
- print_usage(argc, argv, params);
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
  return 1;
  }

+ common_init();

  // number of parallel batches
  int n_parallel = params.n_parallel;
@@ -40,57 +39,64 @@ int main(int argc, char ** argv) {

  // initialize the model

- llama_model_params model_params = llama_model_params_from_gpt_params(params);
+ llama_model_params model_params = common_model_params_to_llama(params);

  llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

  if (model == NULL) {
- fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: error: unable to load model\n" , __func__);
  return 1;
  }

  // tokenize the prompt

  std::vector<llama_token> tokens_list;
- tokens_list = ::llama_tokenize(model, params.prompt, true);
+ tokens_list = common_tokenize(model, params.prompt, true);

  const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;

  // initialize the context

- llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+ llama_context_params ctx_params = common_context_params_to_llama(params);

  ctx_params.n_ctx = n_kv_req;
  ctx_params.n_batch = std::max(n_predict, n_parallel);

  llama_context * ctx = llama_new_context_with_model(model, ctx_params);

+ auto sparams = llama_sampler_chain_default_params();
+
+ llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+ llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
+ llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
+ llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
+ llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+
  if (ctx == NULL) {
- fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
  return 1;
  }

  const int n_ctx = llama_n_ctx(ctx);

- LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+ LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

  // make sure the KV cache is big enough to hold all the prompt and generated tokens
  if (n_kv_req > n_ctx) {
- LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
- LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+ LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
+ LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
  return 1;
  }

  // print the prompt token-by-token

- fprintf(stderr, "\n");
+ LOG("\n");

  for (auto id : tokens_list) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", common_token_to_piece(ctx, id).c_str());
  }

- fflush(stderr);
-
  // create a llama_batch
  // we use this object to submit token data for decoding
  llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@@ -102,13 +108,13 @@ int main(int argc, char ** argv) {

  // evaluate the initial prompt
  for (size_t i = 0; i < tokens_list.size(); ++i) {
- llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
+ common_batch_add(batch, tokens_list[i], i, seq_ids, false);
  }
  GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());

  if (llama_model_has_encoder(model)) {
  if (llama_encode(ctx, batch)) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }

@@ -117,15 +123,15 @@ int main(int argc, char ** argv) {
  decoder_start_token_id = llama_token_bos(model);
  }

- llama_batch_clear(batch);
- llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
+ common_batch_clear(batch);
+ common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
  }

  // llama_decode will output logits only for the last token of the prompt
  batch.logits[batch.n_tokens - 1] = true;

  if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return 1;
  }

@@ -136,7 +142,7 @@ int main(int argc, char ** argv) {
  //}

  if (n_parallel > 1) {
- LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+ LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
  }

  // main loop
@@ -155,7 +161,7 @@ int main(int argc, char ** argv) {

  while (n_cur <= n_predict) {
  // prepare the next batch
- llama_batch_clear(batch);
+ common_batch_clear(batch);

  // sample the next token for each parallel sequence / stream
  for (int32_t i = 0; i < n_parallel; ++i) {
@@ -164,36 +170,14 @@ int main(int argc, char ** argv) {
  continue;
  }

- auto n_vocab = llama_n_vocab(model);
- auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
-
- std::vector<llama_token_data> candidates;
- candidates.reserve(n_vocab);
-
- for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
- candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
- }
-
- llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
- const int top_k = 40;
- const float top_p = 0.9f;
- const float temp = 0.4f;
-
- llama_sample_top_k(ctx, &candidates_p, top_k, 1);
- llama_sample_top_p(ctx, &candidates_p, top_p, 1);
- llama_sample_temp (ctx, &candidates_p, temp);
-
- const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
-
- //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+ const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);

  // is it an end of generation? -> mark the stream as finished
  if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
  i_batch[i] = -1;
- LOG_TEE("\n");
+ LOG("\n");
  if (n_parallel > 1) {
- LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+ LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
  }

  continue;
@@ -201,16 +185,15 @@ int main(int argc, char ** argv) {

  // if there is only one stream, we print immediately to stdout
  if (n_parallel == 1) {
- LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
- fflush(stdout);
+ LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
  }

- streams[i] += llama_token_to_piece(ctx, new_token_id);
+ streams[i] += common_token_to_piece(ctx, new_token_id);

  i_batch[i] = batch.n_tokens;

  // push this new token for next evaluation
- llama_batch_add(batch, new_token_id, n_cur, { i }, true);
+ common_batch_add(batch, new_token_id, n_cur, { i }, true);

  n_decode += 1;
  }
@@ -224,32 +207,33 @@ int main(int argc, char ** argv) {

  // evaluate the current batch with the transformer model
  if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+ LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
  return 1;
  }
  }

- LOG_TEE("\n");
-
  if (n_parallel > 1) {
- LOG_TEE("\n");
+ LOG("\n");

  for (int32_t i = 0; i < n_parallel; ++i) {
- LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+ LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
  }
  }

  const auto t_main_end = ggml_time_us();

- LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+ LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
  __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

- llama_print_timings(ctx);
+ LOG("\n");
+ llama_perf_sampler_print(smpl);
+ llama_perf_context_print(ctx);

  fprintf(stderr, "\n");

  llama_batch_free(batch);

+ llama_sampler_free(smpl);
  llama_free(ctx);
  llama_free_model(model);
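For code that embeds llama.cpp the way this package does, the hunks above replace the removed `llama_sample_*` calls with the new sampler-chain API. A minimal, non-authoritative sketch of that pattern, reusing the top-k/top-p/temperature values the old code hard-coded and assuming a `llama_context` that has already run `llama_decode`:

```cpp
// Sketch of the sampler-chain API this diff migrates batched.cpp to.
// The parameter values mirror the constants removed above (40 / 0.9f / 0.4f).
#include "llama.h"

static llama_token sample_one(llama_context * ctx, int32_t idx) {
    // build a chain: top-k -> top-p -> temperature -> random distribution
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());

    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9f, 1));
    llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4f));
    llama_sampler_chain_add(smpl, llama_sampler_init_dist (LLAMA_DEFAULT_SEED));

    // reads the logits produced by the last llama_decode() call at output index idx
    const llama_token id = llama_sampler_sample(smpl, ctx, idx);

    llama_sampler_free(smpl);
    return id;
}
```

In the actual diff the chain is built once up front and freed with `llama_sampler_free(smpl)` at shutdown, with `llama_perf_sampler_print(smpl)` replacing part of the old `llama_print_timings(ctx)` report.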
 
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp

@@ -1,49 +1,28 @@
+ #include "arg.h"
  #include "common.h"
+ #include "log.h"
  #include "llama.h"

  #include <algorithm>
- #include <cmath>
  #include <cstdio>
  #include <string>
  #include <vector>

- // mutates the input string
- static std::vector<int> parse_list(char * p) {
- std::vector<int> ret;
-
- char * q = p;
-
- while (*p) {
- if (*p == ',') {
- *p = '\0';
- ret.push_back(std::atoi(q));
- q = p + 1;
- }
-
- ++p;
- }
-
- ret.push_back(std::atoi(q));
-
- return ret;
- }
-
- static void print_usage(int argc, char ** argv, const gpt_params & params) {
- gpt_params_print_usage(argc, argv, params);
-
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
- LOG_TEE("\n");
+ static void print_usage(int, char ** argv) {
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+ LOG("\n");
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params)) {
- print_usage(argc, argv, params);
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
  return 1;
  }

+ common_init();
+
  int is_pp_shared = params.is_pp_shared;

  std::vector<int> n_pp = params.n_pp;
@@ -57,7 +36,7 @@ int main(int argc, char ** argv) {

  // initialize the model

- llama_model_params model_params = llama_model_params_from_gpt_params(params);
+ llama_model_params model_params = common_model_params_to_llama(params);

  llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -66,10 +45,10 @@ int main(int argc, char ** argv) {
  return 1;
  }

- llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+ llama_context_params ctx_params = common_context_params_to_llama(params);

  // ensure enough sequences are available
- ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+ ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

  llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -95,12 +74,11 @@ int main(int argc, char ** argv) {
  batch.n_seq_id + i,
  batch.seq_id + i,
  batch.logits + i,
- 0, 0, 0, // unused
  };

  const int ret = llama_decode(ctx, batch_view);
  if (ret != 0) {
- LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+ LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
  return false;
  }

@@ -113,21 +91,22 @@ int main(int argc, char ** argv) {
  // warm up
  {
  for (int i = 0; i < 16; ++i) {
- llama_batch_add(batch, 0, i, { 0 }, false);
+ common_batch_add(batch, 0, i, { 0 }, false);
  }

  if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return 1;
  }
  }

- LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
- LOG_TEE("\n");
-
- LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
- LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+ if (!params.batched_bench_output_jsonl) {
+ LOG("\n");
+ LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG("\n");
+ LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+ LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+ }

  for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
  for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -142,11 +121,11 @@ int main(int argc, char ** argv) {
  continue;
  }

- llama_batch_clear(batch);
+ common_batch_clear(batch);

  for (int i = 0; i < pp; ++i) {
  for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
- llama_batch_add(batch, 0, i, { j }, false);
+ common_batch_add(batch, 0, i, { j }, false);
  }
  }
  batch.logits[batch.n_tokens - 1] = true;
@@ -156,7 +135,7 @@ int main(int argc, char ** argv) {
  llama_kv_cache_clear(ctx);

  if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return 1;
  }

@@ -171,14 +150,14 @@ int main(int argc, char ** argv) {
  const auto t_tg_start = ggml_time_us();

  for (int i = 0; i < tg; ++i) {
- llama_batch_clear(batch);
+ common_batch_clear(batch);

  for (int j = 0; j < pl; ++j) {
- llama_batch_add(batch, 0, pp + i, { j }, true);
+ common_batch_add(batch, 0, pp + i, { j }, true);
  }

  if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return 1;
  }
  }
@@ -195,12 +174,22 @@ int main(int argc, char ** argv) {
  const float speed_tg = pl*tg / t_tg;
  const float speed = n_kv / t;

- LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+ if(params.batched_bench_output_jsonl) {
+ LOG(
+ "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+ "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+ n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+ pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+ );
+ } else {
+ LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+ }
  }
  }
  }

- llama_print_timings(ctx);
+ LOG("\n");
+ llama_perf_context_print(ctx);

  llama_batch_free(batch);

@@ -209,7 +198,7 @@ int main(int argc, char ** argv) {

  llama_backend_free();

- fprintf(stderr, "\n\n");
+ LOG("\n\n");

  return 0;
  }
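Taken together, the two example diffs above illustrate the `gpt_*` → `common_*` rename and the switch from `LOG_TEE` to the new `log.h` macros. A condensed, non-authoritative sketch of the new entry-point pattern (error handling and the decode loop are omitted; old names are kept as comments):

```cpp
// Sketch of the renamed common API used across the updated examples.
#include "arg.h"      // new home of the argument parser
#include "common.h"
#include "log.h"      // LOG / LOG_INF / LOG_ERR replace LOG_TEE
#include "llama.h"

#include <vector>

static void print_usage(int, char ** argv) {
    LOG("\nusage: %s -m model.gguf -p \"prompt\"\n", argv[0]);
}

int main(int argc, char ** argv) {
    common_params params;                                                     // was: gpt_params

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;                                                             // was: gpt_params_parse
    }
    common_init();                                                            // new initialization hook

    llama_model_params   mparams = common_model_params_to_llama(params);      // was: llama_model_params_from_gpt_params
    llama_context_params cparams = common_context_params_to_llama(params);    // was: llama_context_params_from_gpt_params

    llama_model   * model = llama_load_model_from_file(params.model.c_str(), mparams);
    llama_context * ctx   = llama_new_context_with_model(model, cparams);

    std::vector<llama_token> toks = common_tokenize(model, params.prompt, true); // was: ::llama_tokenize

    llama_batch batch = llama_batch_init((int32_t) toks.size(), 0, 1);
    common_batch_clear(batch);                                                // was: llama_batch_clear
    for (size_t i = 0; i < toks.size(); ++i) {
        common_batch_add(batch, toks[i], (llama_pos) i, { 0 }, false);        // was: llama_batch_add
    }

    llama_perf_context_print(ctx);                                            // was: llama_print_timings

    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```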