@fugood/llama.node 0.6.3 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.
Files changed (377)
  1. package/CMakeLists.txt +40 -30
  2. package/README.md +4 -1
  3. package/lib/binding.js +41 -29
  4. package/lib/binding.ts +26 -25
  5. package/package.json +45 -7
  6. package/scripts/build.js +47 -0
  7. package/scripts/llama.cpp.patch +109 -0
  8. package/src/anyascii.c +22223 -0
  9. package/src/anyascii.h +42 -0
  10. package/src/tts_utils.cpp +20 -7
  11. package/src/tts_utils.h +2 -0
  12. package/bin/darwin/arm64/llama-node.node +0 -0
  13. package/bin/darwin/x64/llama-node.node +0 -0
  14. package/bin/linux/arm64/llama-node.node +0 -0
  15. package/bin/linux/x64/llama-node.node +0 -0
  16. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  17. package/bin/linux-cuda/x64/llama-node.node +0 -0
  18. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  19. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  20. package/bin/win32/x64/llama-node.node +0 -0
  21. package/bin/win32/x64/node.lib +0 -0
  22. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  23. package/bin/win32-vulkan/arm64/node.lib +0 -0
  24. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  25. package/bin/win32-vulkan/x64/node.lib +0 -0
  26. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +0 -233
  27. package/src/llama.cpp/.github/workflows/build.yml +0 -1078
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +0 -28
  29. package/src/llama.cpp/.github/workflows/docker.yml +0 -178
  30. package/src/llama.cpp/.github/workflows/editorconfig.yml +0 -29
  31. package/src/llama.cpp/.github/workflows/gguf-publish.yml +0 -44
  32. package/src/llama.cpp/.github/workflows/labeler.yml +0 -17
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +0 -33
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +0 -30
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +0 -40
  36. package/src/llama.cpp/.github/workflows/release.yml +0 -739
  37. package/src/llama.cpp/.github/workflows/server.yml +0 -237
  38. package/src/llama.cpp/.github/workflows/winget.yml +0 -42
  39. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +0 -16
  40. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +0 -16
  41. package/src/llama.cpp/cmake/build-info.cmake +0 -64
  42. package/src/llama.cpp/cmake/common.cmake +0 -35
  43. package/src/llama.cpp/cmake/git-vars.cmake +0 -22
  44. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -5
  45. package/src/llama.cpp/common/build-info.cpp.in +0 -4
  46. package/src/llama.cpp/docs/build.md +0 -561
  47. package/src/llama.cpp/examples/CMakeLists.txt +0 -43
  48. package/src/llama.cpp/examples/batched/CMakeLists.txt +0 -5
  49. package/src/llama.cpp/examples/batched/batched.cpp +0 -246
  50. package/src/llama.cpp/examples/chat-13B.bat +0 -57
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +0 -5
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +0 -941
  53. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +0 -35
  54. package/src/llama.cpp/examples/embedding/CMakeLists.txt +0 -5
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +0 -323
  56. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +0 -10
  57. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +0 -194
  58. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +0 -5
  59. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +0 -83
  60. package/src/llama.cpp/examples/gguf/CMakeLists.txt +0 -5
  61. package/src/llama.cpp/examples/gguf/gguf.cpp +0 -265
  62. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +0 -22
  63. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +0 -46
  64. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +0 -295
  65. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +0 -52
  66. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +0 -221
  67. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +0 -24
  68. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +0 -42
  69. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +0 -7093
  70. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +0 -694
  71. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +0 -5
  72. package/src/llama.cpp/examples/gritlm/gritlm.cpp +0 -229
  73. package/src/llama.cpp/examples/jeopardy/questions.txt +0 -100
  74. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +0 -65
  75. package/src/llama.cpp/examples/llama.android/build.gradle.kts +0 -6
  76. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +0 -71
  77. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +0 -53
  78. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +0 -452
  79. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +0 -18
  80. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +0 -5
  81. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -472
  82. package/src/llama.cpp/examples/lookup/CMakeLists.txt +0 -23
  83. package/src/llama.cpp/examples/lookup/lookup-create.cpp +0 -40
  84. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +0 -47
  85. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -157
  86. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -242
  87. package/src/llama.cpp/examples/parallel/CMakeLists.txt +0 -5
  88. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -492
  89. package/src/llama.cpp/examples/passkey/CMakeLists.txt +0 -5
  90. package/src/llama.cpp/examples/passkey/passkey.cpp +0 -277
  91. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +0 -5
  92. package/src/llama.cpp/examples/retrieval/retrieval.cpp +0 -304
  93. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -5
  94. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -246
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +0 -5
  96. package/src/llama.cpp/examples/simple/simple.cpp +0 -206
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +0 -5
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +0 -206
  99. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +0 -11
  100. package/src/llama.cpp/examples/speculative/CMakeLists.txt +0 -5
  101. package/src/llama.cpp/examples/speculative/speculative.cpp +0 -644
  102. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +0 -5
  103. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +0 -261
  104. package/src/llama.cpp/examples/sycl/CMakeLists.txt +0 -9
  105. package/src/llama.cpp/examples/sycl/build.sh +0 -23
  106. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +0 -13
  107. package/src/llama.cpp/examples/sycl/run-llama2.sh +0 -27
  108. package/src/llama.cpp/examples/sycl/run-llama3.sh +0 -28
  109. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +0 -33
  110. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +0 -9
  111. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +0 -9
  112. package/src/llama.cpp/examples/training/CMakeLists.txt +0 -5
  113. package/src/llama.cpp/examples/training/finetune.cpp +0 -96
  114. package/src/llama.cpp/ggml/cmake/GitVars.cmake +0 -22
  115. package/src/llama.cpp/ggml/cmake/common.cmake +0 -26
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1042
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -255
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -586
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +0 -2008
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +0 -87
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +0 -517
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -74
  123. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +0 -179
  124. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +0 -258
  125. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +0 -2863
  126. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +0 -1110
  127. package/src/llama.cpp/ggml/src/ggml-cann/common.h +0 -420
  128. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -2570
  129. package/src/llama.cpp/ggml/src/ggml-common.h +0 -1857
  130. package/src/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +0 -100
  131. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +0 -184
  132. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +0 -15
  133. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +0 -243
  134. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +0 -140
  135. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -131
  136. package/src/llama.cpp/ggml/src/ggml-impl.h +0 -601
  137. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  138. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  139. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +0 -120
  140. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +0 -622
  141. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -113
  142. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +0 -96
  143. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -5124
  144. package/src/llama.cpp/ggml/src/ggml-opt.cpp +0 -1037
  145. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -5232
  146. package/src/llama.cpp/ggml/src/ggml-quants.h +0 -100
  147. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +0 -9
  148. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +0 -1813
  149. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +0 -189
  150. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +0 -37
  151. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +0 -239
  152. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +0 -39
  153. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -83
  154. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +0 -493
  155. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +0 -197
  156. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +0 -20
  157. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +0 -100
  158. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +0 -20
  159. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +0 -623
  160. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +0 -34
  161. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -701
  162. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +0 -11
  163. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +0 -791
  164. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +0 -1160
  165. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +0 -27
  166. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +0 -2957
  167. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -1536
  168. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +0 -75
  169. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +0 -99
  170. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +0 -311
  171. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +0 -20
  172. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -4443
  173. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +0 -105
  174. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +0 -8
  175. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +0 -136
  176. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +0 -21
  177. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -3030
  178. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +0 -33
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +0 -1108
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +0 -27
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +0 -474
  182. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +0 -26
  183. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +0 -46
  184. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +0 -10
  185. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +0 -74
  186. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +0 -83
  187. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +0 -362
  188. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +0 -20
  189. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +0 -264
  190. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +0 -20
  191. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +0 -13
  192. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +0 -23
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +0 -73
  194. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +0 -20
  195. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +0 -1215
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +0 -305
  197. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +0 -10
  198. package/src/llama.cpp/ggml/src/ggml-threading.cpp +0 -12
  199. package/src/llama.cpp/ggml/src/ggml-threading.h +0 -14
  200. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +0 -196
  201. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +0 -10699
  202. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -39
  203. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +0 -751
  204. package/src/llama.cpp/ggml/src/ggml.c +0 -6550
  205. package/src/llama.cpp/ggml/src/gguf.cpp +0 -1330
  206. package/src/llama.cpp/models/.editorconfig +0 -1
  207. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  208. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  209. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  210. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  211. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  212. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  213. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  214. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  215. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  216. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  217. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  219. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  220. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  221. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  222. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  223. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  225. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  227. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  228. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  230. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  231. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  232. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  233. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  236. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  237. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  239. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  240. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  241. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  242. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  245. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  248. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  249. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  256. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  257. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  259. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  260. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  261. package/src/llama.cpp/pocs/CMakeLists.txt +0 -14
  262. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +0 -9
  263. package/src/llama.cpp/pocs/vdot/q8dot.cpp +0 -173
  264. package/src/llama.cpp/pocs/vdot/vdot.cpp +0 -311
  265. package/src/llama.cpp/prompts/LLM-questions.txt +0 -49
  266. package/src/llama.cpp/prompts/alpaca.txt +0 -1
  267. package/src/llama.cpp/prompts/assistant.txt +0 -31
  268. package/src/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  269. package/src/llama.cpp/prompts/chat-with-bob.txt +0 -7
  270. package/src/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  271. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  272. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  273. package/src/llama.cpp/prompts/chat.txt +0 -28
  274. package/src/llama.cpp/prompts/dan-modified.txt +0 -1
  275. package/src/llama.cpp/prompts/dan.txt +0 -1
  276. package/src/llama.cpp/prompts/mnemonics.txt +0 -93
  277. package/src/llama.cpp/prompts/parallel-questions.txt +0 -43
  278. package/src/llama.cpp/prompts/reason-act.txt +0 -18
  279. package/src/llama.cpp/requirements/requirements-all.txt +0 -15
  280. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +0 -2
  281. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +0 -7
  282. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -7
  283. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +0 -5
  284. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +0 -1
  285. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +0 -4
  286. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +0 -3
  287. package/src/llama.cpp/requirements/requirements-pydantic.txt +0 -3
  288. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +0 -1
  289. package/src/llama.cpp/requirements/requirements-tool_bench.txt +0 -12
  290. package/src/llama.cpp/requirements.txt +0 -13
  291. package/src/llama.cpp/scripts/build-info.sh +0 -30
  292. package/src/llama.cpp/scripts/install-oneapi.bat +0 -19
  293. package/src/llama.cpp/scripts/xxd.cmake +0 -16
  294. package/src/llama.cpp/tests/CMakeLists.txt +0 -177
  295. package/src/llama.cpp/tests/get-model.cpp +0 -21
  296. package/src/llama.cpp/tests/get-model.h +0 -2
  297. package/src/llama.cpp/tests/test-arg-parser.cpp +0 -178
  298. package/src/llama.cpp/tests/test-autorelease.cpp +0 -24
  299. package/src/llama.cpp/tests/test-backend-ops.cpp +0 -4793
  300. package/src/llama.cpp/tests/test-barrier.cpp +0 -94
  301. package/src/llama.cpp/tests/test-c.c +0 -7
  302. package/src/llama.cpp/tests/test-chat-template.cpp +0 -417
  303. package/src/llama.cpp/tests/test-chat.cpp +0 -985
  304. package/src/llama.cpp/tests/test-double-float.cpp +0 -57
  305. package/src/llama.cpp/tests/test-gbnf-validator.cpp +0 -109
  306. package/src/llama.cpp/tests/test-gguf.cpp +0 -1338
  307. package/src/llama.cpp/tests/test-grammar-integration.cpp +0 -1308
  308. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +0 -1201
  309. package/src/llama.cpp/tests/test-grammar-parser.cpp +0 -519
  310. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +0 -1304
  311. package/src/llama.cpp/tests/test-llama-grammar.cpp +0 -408
  312. package/src/llama.cpp/tests/test-log.cpp +0 -39
  313. package/src/llama.cpp/tests/test-model-load-cancel.cpp +0 -27
  314. package/src/llama.cpp/tests/test-mtmd-c-api.c +0 -63
  315. package/src/llama.cpp/tests/test-opt.cpp +0 -904
  316. package/src/llama.cpp/tests/test-quantize-fns.cpp +0 -186
  317. package/src/llama.cpp/tests/test-quantize-perf.cpp +0 -365
  318. package/src/llama.cpp/tests/test-quantize-stats.cpp +0 -424
  319. package/src/llama.cpp/tests/test-regex-partial.cpp +0 -288
  320. package/src/llama.cpp/tests/test-rope.cpp +0 -262
  321. package/src/llama.cpp/tests/test-sampling.cpp +0 -399
  322. package/src/llama.cpp/tests/test-tokenizer-0.cpp +0 -312
  323. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +0 -155
  324. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +0 -125
  325. package/src/llama.cpp/tools/CMakeLists.txt +0 -39
  326. package/src/llama.cpp/tools/batched-bench/CMakeLists.txt +0 -5
  327. package/src/llama.cpp/tools/batched-bench/batched-bench.cpp +0 -204
  328. package/src/llama.cpp/tools/cvector-generator/CMakeLists.txt +0 -5
  329. package/src/llama.cpp/tools/cvector-generator/completions.txt +0 -582
  330. package/src/llama.cpp/tools/cvector-generator/cvector-generator.cpp +0 -508
  331. package/src/llama.cpp/tools/cvector-generator/mean.hpp +0 -48
  332. package/src/llama.cpp/tools/cvector-generator/negative.txt +0 -4
  333. package/src/llama.cpp/tools/cvector-generator/pca.hpp +0 -315
  334. package/src/llama.cpp/tools/cvector-generator/positive.txt +0 -4
  335. package/src/llama.cpp/tools/export-lora/CMakeLists.txt +0 -5
  336. package/src/llama.cpp/tools/export-lora/export-lora.cpp +0 -434
  337. package/src/llama.cpp/tools/gguf-split/CMakeLists.txt +0 -5
  338. package/src/llama.cpp/tools/gguf-split/gguf-split.cpp +0 -583
  339. package/src/llama.cpp/tools/imatrix/CMakeLists.txt +0 -5
  340. package/src/llama.cpp/tools/imatrix/imatrix.cpp +0 -667
  341. package/src/llama.cpp/tools/llama-bench/CMakeLists.txt +0 -5
  342. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +0 -2024
  343. package/src/llama.cpp/tools/main/CMakeLists.txt +0 -5
  344. package/src/llama.cpp/tools/main/main.cpp +0 -977
  345. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +0 -58
  346. package/src/llama.cpp/tools/mtmd/clip-impl.h +0 -462
  347. package/src/llama.cpp/tools/mtmd/clip.cpp +0 -4024
  348. package/src/llama.cpp/tools/mtmd/clip.h +0 -101
  349. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +0 -22
  350. package/src/llama.cpp/tools/mtmd/miniaudio.h +0 -93468
  351. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +0 -855
  352. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +0 -62
  353. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +0 -377
  354. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +0 -297
  355. package/src/llama.cpp/tools/mtmd/mtmd.cpp +0 -942
  356. package/src/llama.cpp/tools/mtmd/mtmd.h +0 -362
  357. package/src/llama.cpp/tools/mtmd/requirements.txt +0 -5
  358. package/src/llama.cpp/tools/perplexity/CMakeLists.txt +0 -5
  359. package/src/llama.cpp/tools/perplexity/perplexity.cpp +0 -2063
  360. package/src/llama.cpp/tools/quantize/CMakeLists.txt +0 -6
  361. package/src/llama.cpp/tools/quantize/quantize.cpp +0 -519
  362. package/src/llama.cpp/tools/rpc/CMakeLists.txt +0 -4
  363. package/src/llama.cpp/tools/rpc/rpc-server.cpp +0 -322
  364. package/src/llama.cpp/tools/run/CMakeLists.txt +0 -16
  365. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.cpp +0 -1995
  366. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.h +0 -137
  367. package/src/llama.cpp/tools/run/run.cpp +0 -1261
  368. package/src/llama.cpp/tools/server/CMakeLists.txt +0 -51
  369. package/src/llama.cpp/tools/server/bench/requirements.txt +0 -2
  370. package/src/llama.cpp/tools/server/httplib.h +0 -10506
  371. package/src/llama.cpp/tools/server/server.cpp +0 -4966
  372. package/src/llama.cpp/tools/server/tests/requirements.txt +0 -8
  373. package/src/llama.cpp/tools/server/utils.hpp +0 -1337
  374. package/src/llama.cpp/tools/tokenize/CMakeLists.txt +0 -5
  375. package/src/llama.cpp/tools/tokenize/tokenize.cpp +0 -416
  376. package/src/llama.cpp/tools/tts/CMakeLists.txt +0 -5
  377. package/src/llama.cpp/tools/tts/tts.cpp +0 -1092
package/src/llama.cpp/tools/llama-bench/llama-bench.cpp
@@ -1,2024 +0,0 @@
1
- #include <algorithm>
2
- #include <array>
3
- #include <cassert>
4
- #include <chrono>
5
- #include <cinttypes>
6
- #include <clocale>
7
- #include <cmath>
8
- #include <cstdio>
9
- #include <cstdlib>
10
- #include <cstring>
11
- #include <ctime>
12
- #include <iterator>
13
- #include <map>
14
- #include <numeric>
15
- #include <regex>
16
- #include <sstream>
17
- #include <string>
18
- #include <thread>
19
- #include <vector>
20
-
21
- #include "common.h"
22
- #include "ggml.h"
23
- #include "llama.h"
24
-
25
- #ifdef _WIN32
26
- # define WIN32_LEAN_AND_MEAN
27
- # ifndef NOMINMAX
28
- # define NOMINMAX
29
- # endif
30
- # include <windows.h>
31
- #endif
32
-
33
- // utils
34
- static uint64_t get_time_ns() {
35
- using clock = std::chrono::high_resolution_clock;
36
- return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
37
- }
38
-
39
- static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
40
- if (a.pattern != b.pattern) {
41
- // cString comparison that may be null
42
- if (a.pattern == nullptr || b.pattern == nullptr) {
43
- return false;
44
- }
45
- if (strcmp(a.pattern, b.pattern) != 0) {
46
- return false;
47
- }
48
- }
49
- if (a.buft != b.buft) {
50
- return false;
51
- }
52
- return true;
53
- }
54
-
55
- static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
56
- if (a.size() != b.size()) {
57
- return false;
58
- }
59
- for (size_t i = 0; i < a.size(); i++) {
60
- if (!tensor_buft_override_equal(a[i], b[i])) {
61
- return false;
62
- }
63
- }
64
- return true;
65
- }
66
-
67
- static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
68
- if (a.size() != b.size()) {
69
- return false;
70
- }
71
- for (size_t i = 0; i < a.size(); i++) {
72
- if (!vec_tensor_buft_override_equal(a[i], b[i])) {
73
- return false;
74
- }
75
- }
76
- return true;
77
- }
78
-
79
- template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
80
- std::ostringstream str;
81
- for (size_t i = 0; i < values.size(); i++) {
82
- str << values[i];
83
- if (i < values.size() - 1) {
84
- str << delim;
85
- }
86
- }
87
- return str.str();
88
- }
89
-
90
- template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
91
- std::vector<std::string> str_values;
92
- std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
93
- return str_values;
94
- }
95
-
96
- template <typename T> static T avg(const std::vector<T> & v) {
97
- if (v.empty()) {
98
- return 0;
99
- }
100
- T sum = std::accumulate(v.begin(), v.end(), T(0));
101
- return sum / (T) v.size();
102
- }
103
-
104
- template <typename T> static T stdev(const std::vector<T> & v) {
105
- if (v.size() <= 1) {
106
- return 0;
107
- }
108
- T mean = avg(v);
109
- T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
110
- T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1));
111
- return stdev;
112
- }
113
-
114
- static std::string get_cpu_info() {
115
- std::vector<std::string> cpu_list;
116
- for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
117
- auto * dev = ggml_backend_dev_get(i);
118
- auto dev_type = ggml_backend_dev_type(dev);
119
- if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
120
- cpu_list.push_back(ggml_backend_dev_description(dev));
121
- }
122
- }
123
- return join(cpu_list, ", ");
124
- }
125
-
126
- static std::string get_gpu_info() {
127
- std::vector<std::string> gpu_list;
128
- for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
129
- auto * dev = ggml_backend_dev_get(i);
130
- auto dev_type = ggml_backend_dev_type(dev);
131
- if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
132
- gpu_list.push_back(ggml_backend_dev_description(dev));
133
- }
134
- }
135
- return join(gpu_list, ", ");
136
- }
137
-
138
- // command line params
139
- enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
140
-
141
- static const char * output_format_str(output_formats format) {
142
- switch (format) {
143
- case NONE:
144
- return "none";
145
- case CSV:
146
- return "csv";
147
- case JSON:
148
- return "json";
149
- case JSONL:
150
- return "jsonl";
151
- case MARKDOWN:
152
- return "md";
153
- case SQL:
154
- return "sql";
155
- default:
156
- GGML_ABORT("invalid output format");
157
- }
158
- }
159
-
160
- static bool output_format_from_str(const std::string & s, output_formats & format) {
161
- if (s == "none") {
162
- format = NONE;
163
- } else if (s == "csv") {
164
- format = CSV;
165
- } else if (s == "json") {
166
- format = JSON;
167
- } else if (s == "jsonl") {
168
- format = JSONL;
169
- } else if (s == "md") {
170
- format = MARKDOWN;
171
- } else if (s == "sql") {
172
- format = SQL;
173
- } else {
174
- return false;
175
- }
176
- return true;
177
- }
178
-
179
- static const char * split_mode_str(llama_split_mode mode) {
180
- switch (mode) {
181
- case LLAMA_SPLIT_MODE_NONE:
182
- return "none";
183
- case LLAMA_SPLIT_MODE_LAYER:
184
- return "layer";
185
- case LLAMA_SPLIT_MODE_ROW:
186
- return "row";
187
- default:
188
- GGML_ABORT("invalid split mode");
189
- }
190
- }
191
-
192
- static std::string pair_str(const std::pair<int, int> & p) {
193
- static char buf[32];
194
- snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
195
- return buf;
196
- }
197
-
198
- static std::vector<int> parse_int_range(const std::string & s) {
199
- // first[-last[(+|*)step]]
200
- std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
201
-
202
- std::smatch match;
203
- std::string::const_iterator search_start(s.cbegin());
204
- std::vector<int> result;
205
- while (std::regex_search(search_start, s.cend(), match, range_regex)) {
206
- int first = std::stoi(match[1]);
207
- int last = match[2].matched ? std::stoi(match[2]) : first;
208
- char op = match[3].matched ? match[3].str()[0] : '+';
209
- int step = match[4].matched ? std::stoi(match[4]) : 1;
210
-
211
- for (int i = first; i <= last;) {
212
- result.push_back(i);
213
-
214
- int prev_i = i;
215
-
216
- if (op == '+') {
217
- i += step;
218
- } else if (op == '*') {
219
- i *= step;
220
- } else {
221
- throw std::invalid_argument("invalid range format");
222
- }
223
-
224
- if (i <= prev_i) {
225
- throw std::invalid_argument("invalid range");
226
- }
227
- }
228
- search_start = match.suffix().first;
229
- }
230
-
231
- if (search_start != s.cend()) {
232
- throw std::invalid_argument("invalid range format");
233
- }
234
-
235
- return result;
236
- }
237
-
238
- struct cmd_params {
239
- std::vector<std::string> model;
240
- std::vector<int> n_prompt;
241
- std::vector<int> n_gen;
242
- std::vector<std::pair<int, int>> n_pg;
243
- std::vector<int> n_depth;
244
- std::vector<int> n_batch;
245
- std::vector<int> n_ubatch;
246
- std::vector<ggml_type> type_k;
247
- std::vector<ggml_type> type_v;
248
- std::vector<float> defrag_thold;
249
- std::vector<int> n_threads;
250
- std::vector<std::string> cpu_mask;
251
- std::vector<bool> cpu_strict;
252
- std::vector<int> poll;
253
- std::vector<int> n_gpu_layers;
254
- std::vector<std::string> rpc_servers;
255
- std::vector<llama_split_mode> split_mode;
256
- std::vector<int> main_gpu;
257
- std::vector<bool> no_kv_offload;
258
- std::vector<bool> flash_attn;
259
- std::vector<std::vector<float>> tensor_split;
260
- std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
261
- std::vector<bool> use_mmap;
262
- std::vector<bool> embeddings;
263
- std::vector<bool> no_op_offload;
264
- ggml_numa_strategy numa;
265
- int reps;
266
- ggml_sched_priority prio;
267
- int delay;
268
- bool verbose;
269
- bool progress;
270
- output_formats output_format;
271
- output_formats output_format_stderr;
272
- };
273
-
274
- static const cmd_params cmd_params_defaults = {
275
- /* model */ { "models/7B/ggml-model-q4_0.gguf" },
276
- /* n_prompt */ { 512 },
277
- /* n_gen */ { 128 },
278
- /* n_pg */ {},
279
- /* n_depth */ { 0 },
280
- /* n_batch */ { 2048 },
281
- /* n_ubatch */ { 512 },
282
- /* type_k */ { GGML_TYPE_F16 },
283
- /* type_v */ { GGML_TYPE_F16 },
284
- /* defrag_thold */ { -1.0f },
285
- /* n_threads */ { cpu_get_num_math() },
286
- /* cpu_mask */ { "0x0" },
287
- /* cpu_strict */ { false },
288
- /* poll */ { 50 },
289
- /* n_gpu_layers */ { 99 },
290
- /* rpc_servers */ { "" },
291
- /* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
292
- /* main_gpu */ { 0 },
293
- /* no_kv_offload */ { false },
294
- /* flash_attn */ { false },
295
- /* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
296
- /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
297
- /* use_mmap */ { true },
298
- /* embeddings */ { false },
299
- /* no_op_offload */ { false },
300
- /* numa */ GGML_NUMA_STRATEGY_DISABLED,
301
- /* reps */ 5,
302
- /* prio */ GGML_SCHED_PRIO_NORMAL,
303
- /* delay */ 0,
304
- /* verbose */ false,
305
- /* progress */ false,
306
- /* output_format */ MARKDOWN,
307
- /* output_format_stderr */ NONE,
308
- };
309
-
310
- static void print_usage(int /* argc */, char ** argv) {
311
- printf("usage: %s [options]\n", argv[0]);
312
- printf("\n");
313
- printf("options:\n");
314
- printf(" -h, --help\n");
315
- printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
316
- printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
317
- cmd_params_defaults.reps);
318
- printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n",
319
- cmd_params_defaults.prio);
320
- printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
321
- cmd_params_defaults.delay);
322
- printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
323
- output_format_str(cmd_params_defaults.output_format));
324
- printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
325
- output_format_str(cmd_params_defaults.output_format_stderr));
326
- printf(" -v, --verbose verbose output\n");
327
- printf(" --progress print test progress indicators\n");
328
- printf("\n");
329
- printf("test parameters:\n");
330
- printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
331
- printf(" -p, --n-prompt <n> (default: %s)\n",
332
- join(cmd_params_defaults.n_prompt, ",").c_str());
333
- printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
334
- printf(" -pg <pp,tg> (default: %s)\n",
335
- join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
336
- printf(" -d, --n-depth <n> (default: %s)\n",
337
- join(cmd_params_defaults.n_depth, ",").c_str());
338
- printf(" -b, --batch-size <n> (default: %s)\n",
339
- join(cmd_params_defaults.n_batch, ",").c_str());
340
- printf(" -ub, --ubatch-size <n> (default: %s)\n",
341
- join(cmd_params_defaults.n_ubatch, ",").c_str());
342
- printf(" -ctk, --cache-type-k <t> (default: %s)\n",
343
- join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
344
- printf(" -ctv, --cache-type-v <t> (default: %s)\n",
345
- join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
346
- printf(" -dt, --defrag-thold <f> (default: %s)\n",
347
- join(cmd_params_defaults.defrag_thold, ",").c_str());
348
- printf(" -t, --threads <n> (default: %s)\n",
349
- join(cmd_params_defaults.n_threads, ",").c_str());
350
- printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
351
- join(cmd_params_defaults.cpu_mask, ",").c_str());
352
- printf(" --cpu-strict <0|1> (default: %s)\n",
353
- join(cmd_params_defaults.cpu_strict, ",").c_str());
354
- printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
355
- printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
356
- join(cmd_params_defaults.n_gpu_layers, ",").c_str());
357
- if (llama_supports_rpc()) {
358
- printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
359
- join(cmd_params_defaults.rpc_servers, ",").c_str());
360
- }
361
- printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
362
- join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
363
- printf(" -mg, --main-gpu <i> (default: %s)\n",
364
- join(cmd_params_defaults.main_gpu, ",").c_str());
365
- printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
366
- join(cmd_params_defaults.no_kv_offload, ",").c_str());
367
- printf(" -fa, --flash-attn <0|1> (default: %s)\n",
368
- join(cmd_params_defaults.flash_attn, ",").c_str());
369
- printf(" -mmp, --mmap <0|1> (default: %s)\n",
370
- join(cmd_params_defaults.use_mmap, ",").c_str());
371
- printf(" -embd, --embeddings <0|1> (default: %s)\n",
372
- join(cmd_params_defaults.embeddings, ",").c_str());
373
- printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
374
- printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
375
- printf(" (default: disabled)\n");
376
- printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
377
- printf("\n");
378
- printf(
379
- "Multiple values can be given for each parameter by separating them with ','\n"
380
- "or by specifying the parameter multiple times. Ranges can be given as\n"
381
- "'first-last' or 'first-last+step' or 'first-last*mult'.\n");
382
- }
383
-
384
- static ggml_type ggml_type_from_name(const std::string & s) {
385
- if (s == "f16") {
386
- return GGML_TYPE_F16;
387
- }
388
- if (s == "bf16") {
389
- return GGML_TYPE_BF16;
390
- }
391
- if (s == "q8_0") {
392
- return GGML_TYPE_Q8_0;
393
- }
394
- if (s == "q4_0") {
395
- return GGML_TYPE_Q4_0;
396
- }
397
- if (s == "q4_1") {
398
- return GGML_TYPE_Q4_1;
399
- }
400
- if (s == "q5_0") {
401
- return GGML_TYPE_Q5_0;
402
- }
403
- if (s == "q5_1") {
404
- return GGML_TYPE_Q5_1;
405
- }
406
- if (s == "iq4_nl") {
407
- return GGML_TYPE_IQ4_NL;
408
- }
409
-
410
- return GGML_TYPE_COUNT;
411
- }
412
-
413
- static cmd_params parse_cmd_params(int argc, char ** argv) {
414
- cmd_params params;
415
- std::string arg;
416
- bool invalid_param = false;
417
- const std::string arg_prefix = "--";
418
- const char split_delim = ',';
419
-
420
- params.verbose = cmd_params_defaults.verbose;
421
- params.output_format = cmd_params_defaults.output_format;
422
- params.output_format_stderr = cmd_params_defaults.output_format_stderr;
423
- params.reps = cmd_params_defaults.reps;
424
- params.numa = cmd_params_defaults.numa;
425
- params.prio = cmd_params_defaults.prio;
426
- params.delay = cmd_params_defaults.delay;
427
- params.progress = cmd_params_defaults.progress;
428
-
429
- for (int i = 1; i < argc; i++) {
430
- arg = argv[i];
431
- if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
432
- std::replace(arg.begin(), arg.end(), '_', '-');
433
- }
434
-
435
- try {
436
- if (arg == "-h" || arg == "--help") {
437
- print_usage(argc, argv);
438
- exit(0);
439
- } else if (arg == "-m" || arg == "--model") {
440
- if (++i >= argc) {
441
- invalid_param = true;
442
- break;
443
- }
444
- auto p = string_split<std::string>(argv[i], split_delim);
445
- params.model.insert(params.model.end(), p.begin(), p.end());
446
- } else if (arg == "-p" || arg == "--n-prompt") {
447
- if (++i >= argc) {
448
- invalid_param = true;
449
- break;
450
- }
451
- auto p = parse_int_range(argv[i]);
452
- params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
453
- } else if (arg == "-n" || arg == "--n-gen") {
454
- if (++i >= argc) {
455
- invalid_param = true;
456
- break;
457
- }
458
- auto p = parse_int_range(argv[i]);
459
- params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
460
- } else if (arg == "-pg") {
461
- if (++i >= argc) {
462
- invalid_param = true;
463
- break;
464
- }
465
- auto p = string_split<std::string>(argv[i], ',');
466
- if (p.size() != 2) {
467
- invalid_param = true;
468
- break;
469
- }
470
- params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
471
- } else if (arg == "-d" || arg == "--n-depth") {
472
- if (++i >= argc) {
473
- invalid_param = true;
474
- break;
475
- }
476
- auto p = parse_int_range(argv[i]);
477
- params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
478
- } else if (arg == "-b" || arg == "--batch-size") {
479
- if (++i >= argc) {
480
- invalid_param = true;
481
- break;
482
- }
483
- auto p = parse_int_range(argv[i]);
484
- params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
485
- } else if (arg == "-ub" || arg == "--ubatch-size") {
486
- if (++i >= argc) {
487
- invalid_param = true;
488
- break;
489
- }
490
- auto p = parse_int_range(argv[i]);
491
- params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
492
- } else if (arg == "-ctk" || arg == "--cache-type-k") {
493
- if (++i >= argc) {
494
- invalid_param = true;
495
- break;
496
- }
497
- auto p = string_split<std::string>(argv[i], split_delim);
498
-
499
- std::vector<ggml_type> types;
500
- for (const auto & t : p) {
501
- ggml_type gt = ggml_type_from_name(t);
502
- if (gt == GGML_TYPE_COUNT) {
503
- invalid_param = true;
504
- break;
505
- }
506
- types.push_back(gt);
507
- }
508
- if (invalid_param) {
509
- break;
510
- }
511
- params.type_k.insert(params.type_k.end(), types.begin(), types.end());
512
- } else if (arg == "-ctv" || arg == "--cache-type-v") {
513
- if (++i >= argc) {
514
- invalid_param = true;
515
- break;
516
- }
517
- auto p = string_split<std::string>(argv[i], split_delim);
518
-
519
- std::vector<ggml_type> types;
520
- for (const auto & t : p) {
521
- ggml_type gt = ggml_type_from_name(t);
522
- if (gt == GGML_TYPE_COUNT) {
523
- invalid_param = true;
524
- break;
525
- }
526
- types.push_back(gt);
527
- }
528
- if (invalid_param) {
529
- break;
530
- }
531
- params.type_v.insert(params.type_v.end(), types.begin(), types.end());
532
- } else if (arg == "-dt" || arg == "--defrag-thold") {
533
- if (++i >= argc) {
534
- invalid_param = true;
535
- break;
536
- }
537
- auto p = string_split<float>(argv[i], split_delim);
538
- params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
539
- } else if (arg == "-t" || arg == "--threads") {
540
- if (++i >= argc) {
541
- invalid_param = true;
542
- break;
543
- }
544
- auto p = parse_int_range(argv[i]);
545
- params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
546
- } else if (arg == "-C" || arg == "--cpu-mask") {
547
- if (++i >= argc) {
548
- invalid_param = true;
549
- break;
550
- }
551
- auto p = string_split<std::string>(argv[i], split_delim);
552
- params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
553
- } else if (arg == "--cpu-strict") {
554
- if (++i >= argc) {
555
- invalid_param = true;
556
- break;
557
- }
558
- auto p = string_split<bool>(argv[i], split_delim);
559
- params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
560
- } else if (arg == "--poll") {
561
- if (++i >= argc) {
562
- invalid_param = true;
563
- break;
564
- }
565
- auto p = parse_int_range(argv[i]);
566
- params.poll.insert(params.poll.end(), p.begin(), p.end());
567
- } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
568
- if (++i >= argc) {
569
- invalid_param = true;
570
- break;
571
- }
572
- auto p = parse_int_range(argv[i]);
573
- params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
574
- } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
575
- if (++i >= argc) {
576
- invalid_param = true;
577
- break;
578
- }
579
- params.rpc_servers.push_back(argv[i]);
580
- } else if (arg == "-sm" || arg == "--split-mode") {
581
- if (++i >= argc) {
582
- invalid_param = true;
583
- break;
584
- }
585
- auto p = string_split<std::string>(argv[i], split_delim);
586
-
587
- std::vector<llama_split_mode> modes;
588
- for (const auto & m : p) {
589
- llama_split_mode mode;
590
- if (m == "none") {
591
- mode = LLAMA_SPLIT_MODE_NONE;
592
- } else if (m == "layer") {
593
- mode = LLAMA_SPLIT_MODE_LAYER;
594
- } else if (m == "row") {
595
- mode = LLAMA_SPLIT_MODE_ROW;
596
- } else {
597
- invalid_param = true;
598
- break;
599
- }
600
- modes.push_back(mode);
601
- }
602
- if (invalid_param) {
603
- break;
604
- }
605
- params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
606
- } else if (arg == "-mg" || arg == "--main-gpu") {
607
- if (++i >= argc) {
608
- invalid_param = true;
609
- break;
610
- }
611
- params.main_gpu = parse_int_range(argv[i]);
612
- } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
613
- if (++i >= argc) {
614
- invalid_param = true;
615
- break;
616
- }
617
- auto p = string_split<bool>(argv[i], split_delim);
618
- params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
619
- } else if (arg == "--numa") {
620
- if (++i >= argc) {
621
- invalid_param = true;
622
- break;
623
- }
624
- std::string value(argv[i]);
625
- if (value == "distribute" || value == "") {
626
- params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
627
- } else if (value == "isolate") {
628
- params.numa = GGML_NUMA_STRATEGY_ISOLATE;
629
- } else if (value == "numactl") {
630
- params.numa = GGML_NUMA_STRATEGY_NUMACTL;
631
- } else {
632
- invalid_param = true;
633
- break;
634
- }
635
- } else if (arg == "-fa" || arg == "--flash-attn") {
636
- if (++i >= argc) {
637
- invalid_param = true;
638
- break;
639
- }
640
- auto p = string_split<bool>(argv[i], split_delim);
641
- params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
642
- } else if (arg == "-mmp" || arg == "--mmap") {
643
- if (++i >= argc) {
644
- invalid_param = true;
645
- break;
646
- }
647
- auto p = string_split<bool>(argv[i], split_delim);
648
- params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
649
- } else if (arg == "-embd" || arg == "--embeddings") {
650
- if (++i >= argc) {
651
- invalid_param = true;
652
- break;
653
- }
654
- auto p = string_split<bool>(argv[i], split_delim);
655
- params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
656
- } else if (arg == "-nopo" || arg == "--no-op-offload") {
657
- if (++i >= argc) {
658
- invalid_param = true;
659
- break;
660
- }
661
- auto p = string_split<bool>(argv[i], split_delim);
662
- params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
663
- } else if (arg == "-ts" || arg == "--tensor-split") {
664
- if (++i >= argc) {
665
- invalid_param = true;
666
- break;
667
- }
668
- for (auto ts : string_split<std::string>(argv[i], split_delim)) {
669
- // split string by ; and /
670
- const std::regex regex{ R"([;/]+)" };
671
- std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
672
- std::vector<std::string> split_arg{ it, {} };
673
- GGML_ASSERT(split_arg.size() <= llama_max_devices());
674
-
675
- std::vector<float> tensor_split(llama_max_devices());
676
- for (size_t i = 0; i < llama_max_devices(); ++i) {
677
- if (i < split_arg.size()) {
678
- tensor_split[i] = std::stof(split_arg[i]);
679
- } else {
680
- tensor_split[i] = 0.0f;
681
- }
682
- }
683
- params.tensor_split.push_back(tensor_split);
684
- }
685
- } else if (arg == "-ot" || arg == "--override-tensor") {
686
- if (++i >= argc) {
687
- invalid_param = true;
688
- break;
689
- }
690
- auto * value = argv[i];
691
- /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
692
- if (buft_list.empty()) {
693
- // enumerate all the devices and add their buffer types to the list
694
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
695
- auto * dev = ggml_backend_dev_get(i);
696
- auto * buft = ggml_backend_dev_buffer_type(dev);
697
- if (buft) {
698
- buft_list[ggml_backend_buft_name(buft)] = buft;
699
- }
700
- }
701
- }
702
- auto override_group_span_len = std::strcspn(value, ",");
703
- bool last_group = false;
704
- do {
705
- if (override_group_span_len == 0) {
706
- // Adds an empty override-tensors for an empty span
707
- params.tensor_buft_overrides.push_back({{}});
708
- if (value[override_group_span_len] == '\0') {
709
- value = &value[override_group_span_len];
710
- last_group = true;
711
- } else {
712
- value = &value[override_group_span_len + 1];
713
- override_group_span_len = std::strcspn(value, ",");
714
- }
715
- continue;
716
- }
717
- // Stamps null terminators into the argv
718
- // value for this option to avoid the
719
- // memory leak present in the implementation
720
- // over in arg.cpp. Acceptable because we
721
- // only parse these args once in this program.
722
- auto * override_group = value;
723
- if (value[override_group_span_len] == '\0') {
724
- value = &value[override_group_span_len];
725
- last_group = true;
726
- } else {
727
- value[override_group_span_len] = '\0';
728
- value = &value[override_group_span_len + 1];
729
- }
730
- std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
731
- auto override_span_len = std::strcspn(override_group, ";");
732
- while (override_span_len > 0) {
733
- auto * override = override_group;
734
- if (override_group[override_span_len] != '\0') {
735
- override_group[override_span_len] = '\0';
736
- override_group = &override_group[override_span_len + 1];
737
- } else {
738
- override_group = &override_group[override_span_len];
739
- }
740
- auto tensor_name_span_len = std::strcspn(override, "=");
741
- if (tensor_name_span_len >= override_span_len) {
742
- invalid_param = true;
743
- break;
744
- }
745
- override[tensor_name_span_len] = '\0';
746
- auto * tensor_name = override;
747
- auto * buffer_type = &override[tensor_name_span_len + 1];
748
- if (buft_list.find(buffer_type) == buft_list.end()) {
749
- printf("error: unrecognized buffer type '%s'\n", buffer_type);
750
- printf("Available buffer types:\n");
751
- for (const auto & it : buft_list) {
752
- printf(" %s\n", ggml_backend_buft_name(it.second));
753
- }
754
- invalid_param = true;
755
- break;
756
- }
757
- group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
758
- override_span_len = std::strcspn(override_group, ";");
759
- }
760
- if (invalid_param) {
761
- break;
762
- }
763
- group_tensor_buft_overrides.push_back({nullptr,nullptr});
764
- params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
765
- override_group_span_len = std::strcspn(value, ",");
766
- } while (!last_group);
767
- } else if (arg == "-r" || arg == "--repetitions") {
768
- if (++i >= argc) {
769
- invalid_param = true;
770
- break;
771
- }
772
- params.reps = std::stoi(argv[i]);
773
- } else if (arg == "--prio") {
774
- if (++i >= argc) {
775
- invalid_param = true;
776
- break;
777
- }
778
- params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
779
- } else if (arg == "--delay") {
780
- if (++i >= argc) {
781
- invalid_param = true;
782
- break;
783
- }
784
- params.delay = std::stoi(argv[i]);
785
- } else if (arg == "-o" || arg == "--output") {
786
- if (++i >= argc) {
787
- invalid_param = true;
788
- break;
789
- }
790
- invalid_param = !output_format_from_str(argv[i], params.output_format);
791
- } else if (arg == "-oe" || arg == "--output-err") {
792
- if (++i >= argc) {
793
- invalid_param = true;
794
- break;
795
- }
796
- invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
797
- } else if (arg == "-v" || arg == "--verbose") {
798
- params.verbose = true;
799
- } else if (arg == "--progress") {
800
- params.progress = true;
801
- } else {
802
- invalid_param = true;
803
- break;
804
- }
805
- } catch (const std::exception & e) {
806
- fprintf(stderr, "error: %s\n", e.what());
807
- invalid_param = true;
808
- break;
809
- }
810
- }
811
-
812
- if (invalid_param) {
813
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
814
- print_usage(argc, argv);
815
- exit(1);
816
- }
817
-
818
- // set defaults
819
- if (params.model.empty()) {
820
- params.model = cmd_params_defaults.model;
821
- }
822
- if (params.n_prompt.empty()) {
823
- params.n_prompt = cmd_params_defaults.n_prompt;
824
- }
825
- if (params.n_gen.empty()) {
826
- params.n_gen = cmd_params_defaults.n_gen;
827
- }
828
- if (params.n_pg.empty()) {
829
- params.n_pg = cmd_params_defaults.n_pg;
830
- }
831
- if (params.n_depth.empty()) {
832
- params.n_depth = cmd_params_defaults.n_depth;
833
- }
834
- if (params.n_batch.empty()) {
835
- params.n_batch = cmd_params_defaults.n_batch;
836
- }
837
- if (params.n_ubatch.empty()) {
838
- params.n_ubatch = cmd_params_defaults.n_ubatch;
839
- }
840
- if (params.type_k.empty()) {
841
- params.type_k = cmd_params_defaults.type_k;
842
- }
843
- if (params.type_v.empty()) {
844
- params.type_v = cmd_params_defaults.type_v;
845
- }
846
- if (params.defrag_thold.empty()) {
847
- params.defrag_thold = cmd_params_defaults.defrag_thold;
848
- }
849
- if (params.n_gpu_layers.empty()) {
850
- params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
851
- }
852
- if (params.rpc_servers.empty()) {
853
- params.rpc_servers = cmd_params_defaults.rpc_servers;
854
- }
855
- if (params.split_mode.empty()) {
856
- params.split_mode = cmd_params_defaults.split_mode;
857
- }
858
- if (params.main_gpu.empty()) {
859
- params.main_gpu = cmd_params_defaults.main_gpu;
860
- }
861
- if (params.no_kv_offload.empty()) {
862
- params.no_kv_offload = cmd_params_defaults.no_kv_offload;
863
- }
864
- if (params.flash_attn.empty()) {
865
- params.flash_attn = cmd_params_defaults.flash_attn;
866
- }
867
- if (params.tensor_split.empty()) {
868
- params.tensor_split = cmd_params_defaults.tensor_split;
869
- }
870
- if (params.tensor_buft_overrides.empty()) {
871
- params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
872
- }
873
- if (params.use_mmap.empty()) {
874
- params.use_mmap = cmd_params_defaults.use_mmap;
875
- }
876
- if (params.embeddings.empty()) {
877
- params.embeddings = cmd_params_defaults.embeddings;
878
- }
879
- if (params.no_op_offload.empty()) {
880
- params.no_op_offload = cmd_params_defaults.no_op_offload;
881
- }
882
- if (params.n_threads.empty()) {
883
- params.n_threads = cmd_params_defaults.n_threads;
884
- }
885
- if (params.cpu_mask.empty()) {
886
- params.cpu_mask = cmd_params_defaults.cpu_mask;
887
- }
888
- if (params.cpu_strict.empty()) {
889
- params.cpu_strict = cmd_params_defaults.cpu_strict;
890
- }
891
- if (params.poll.empty()) {
892
- params.poll = cmd_params_defaults.poll;
893
- }
894
-
895
- return params;
896
- }
897
-
898
-struct cmd_params_instance {
-    std::string model;
-    int n_prompt;
-    int n_gen;
-    int n_depth;
-    int n_batch;
-    int n_ubatch;
-    ggml_type type_k;
-    ggml_type type_v;
-    float defrag_thold;
-    int n_threads;
-    std::string cpu_mask;
-    bool cpu_strict;
-    int poll;
-    int n_gpu_layers;
-    std::string rpc_servers_str;
-    llama_split_mode split_mode;
-    int main_gpu;
-    bool no_kv_offload;
-    bool flash_attn;
-    std::vector<float> tensor_split;
-    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
-    bool use_mmap;
-    bool embeddings;
-    bool no_op_offload;
-
-    llama_model_params to_llama_mparams() const {
-        llama_model_params mparams = llama_model_default_params();
-
-        mparams.n_gpu_layers = n_gpu_layers;
-        if (!rpc_servers_str.empty()) {
-            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
-
-            // add RPC devices
-            if (!rpc_servers.empty()) {
-                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
-                if (!rpc_reg) {
-                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
-                    exit(1);
-                }
-
-                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
-                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
-                if (!ggml_backend_rpc_add_device_fn) {
-                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
-                    exit(1);
-                }
-                static std::vector<ggml_backend_dev_t> devices;
-                devices.clear();
-                for (const std::string & server : rpc_servers) {
-                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
-                    if (dev) {
-                        devices.push_back(dev);
-                    } else {
-                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
-                        exit(1);
-                    }
-                }
-                devices.push_back(nullptr);
-                mparams.devices = devices.data();
-            }
-        }
-        mparams.split_mode = split_mode;
-        mparams.main_gpu = main_gpu;
-        mparams.tensor_split = tensor_split.data();
-        mparams.use_mmap = use_mmap;
-
-        if (tensor_buft_overrides.empty()) {
-            mparams.tensor_buft_overrides = nullptr;
-        } else {
-            GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
-            mparams.tensor_buft_overrides = tensor_buft_overrides.data();
-        }
-
-        return mparams;
-    }
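Note how the RPC path above avoids a hard link-time dependency: the backend is looked up by name in the registry and `ggml_backend_rpc_add_device` is resolved through `ggml_backend_reg_get_proc_address`, so non-RPC builds still link and run. A minimal standalone sketch of the same pattern (the endpoint string is a hypothetical example; assumes a ggml build that ships the RPC backend):

```cpp
#include <cstdio>
#include "ggml-backend.h"

// function-pointer type matching the RPC backend's device-add entry point
typedef ggml_backend_dev_t (*rpc_add_device_fn)(const char * endpoint);

int main() {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
    if (!reg) {
        fprintf(stderr, "RPC backend not available in this build\n");
        return 1;
    }
    // resolved at runtime instead of linked at build time
    auto add_device = (rpc_add_device_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_add_device");
    if (add_device) {
        ggml_backend_dev_t dev = add_device("127.0.0.1:50052"); // hypothetical endpoint
        printf("RPC device %s\n", dev ? "added" : "rejected");
    }
    return 0;
}
```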
-
-    bool equal_mparams(const cmd_params_instance & other) const {
-        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
-               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
-               tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
-    }
-
-    llama_context_params to_llama_cparams() const {
-        llama_context_params cparams = llama_context_default_params();
-
-        cparams.n_ctx = n_prompt + n_gen + n_depth;
-        cparams.n_batch = n_batch;
-        cparams.n_ubatch = n_ubatch;
-        cparams.type_k = type_k;
-        cparams.type_v = type_v;
-        cparams.defrag_thold = defrag_thold;
-        cparams.offload_kqv = !no_kv_offload;
-        cparams.flash_attn = flash_attn;
-        cparams.embeddings = embeddings;
-        cparams.op_offload = !no_op_offload;
-        cparams.swa_full = false;
-
-        return cparams;
-    }
-};
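One detail worth calling out in `to_llama_cparams()`: the context is sized as `n_prompt + n_gen + n_depth`, because a depth run first fills the KV cache with `n_depth` tokens before the measured prompt/generation happens on top of it. For a hypothetical `-p 512 -n 128 -d 4096` instance:

```cpp
// n_ctx must hold the depth prefix plus the measured tokens
int n_ctx = 512 + 128 + 4096; // = 4736 KV-cache slots for this instance
```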
-
-static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
-    std::vector<cmd_params_instance> instances;
-
-    // this ordering minimizes the number of times that each model needs to be reloaded
-    // clang-format off
-    for (const auto & m : params.model)
-    for (const auto & nl : params.n_gpu_layers)
-    for (const auto & rpc : params.rpc_servers)
-    for (const auto & sm : params.split_mode)
-    for (const auto & mg : params.main_gpu)
-    for (const auto & ts : params.tensor_split)
-    for (const auto & ot : params.tensor_buft_overrides)
-    for (const auto & mmp : params.use_mmap)
-    for (const auto & embd : params.embeddings)
-    for (const auto & nopo : params.no_op_offload)
-    for (const auto & nb : params.n_batch)
-    for (const auto & nub : params.n_ubatch)
-    for (const auto & tk : params.type_k)
-    for (const auto & tv : params.type_v)
-    for (const auto & defrag_thold : params.defrag_thold)
-    for (const auto & nkvo : params.no_kv_offload)
-    for (const auto & fa : params.flash_attn)
-    for (const auto & nt : params.n_threads)
-    for (const auto & cm : params.cpu_mask)
-    for (const auto & cs : params.cpu_strict)
-    for (const auto & nd : params.n_depth)
-    for (const auto & pl : params.poll) {
-        for (const auto & n_prompt : params.n_prompt) {
-            if (n_prompt == 0) {
-                continue;
-            }
-            cmd_params_instance instance = {
-                /* .model                 = */ m,
-                /* .n_prompt              = */ n_prompt,
-                /* .n_gen                 = */ 0,
-                /* .n_depth               = */ nd,
-                /* .n_batch               = */ nb,
-                /* .n_ubatch              = */ nub,
-                /* .type_k                = */ tk,
-                /* .type_v                = */ tv,
-                /* .defrag_thold          = */ defrag_thold,
-                /* .n_threads             = */ nt,
-                /* .cpu_mask              = */ cm,
-                /* .cpu_strict            = */ cs,
-                /* .poll                  = */ pl,
-                /* .n_gpu_layers          = */ nl,
-                /* .rpc_servers           = */ rpc,
-                /* .split_mode            = */ sm,
-                /* .main_gpu              = */ mg,
-                /* .no_kv_offload         = */ nkvo,
-                /* .flash_attn            = */ fa,
-                /* .tensor_split          = */ ts,
-                /* .tensor_buft_overrides = */ ot,
-                /* .use_mmap              = */ mmp,
-                /* .embeddings            = */ embd,
-                /* .no_op_offload         = */ nopo,
-            };
-            instances.push_back(instance);
-        }
-
-        for (const auto & n_gen : params.n_gen) {
-            if (n_gen == 0) {
-                continue;
-            }
-            cmd_params_instance instance = {
-                /* .model                 = */ m,
-                /* .n_prompt              = */ 0,
-                /* .n_gen                 = */ n_gen,
-                /* .n_depth               = */ nd,
-                /* .n_batch               = */ nb,
-                /* .n_ubatch              = */ nub,
-                /* .type_k                = */ tk,
-                /* .type_v                = */ tv,
-                /* .defrag_thold          = */ defrag_thold,
-                /* .n_threads             = */ nt,
-                /* .cpu_mask              = */ cm,
-                /* .cpu_strict            = */ cs,
-                /* .poll                  = */ pl,
-                /* .n_gpu_layers          = */ nl,
-                /* .rpc_servers           = */ rpc,
-                /* .split_mode            = */ sm,
-                /* .main_gpu              = */ mg,
-                /* .no_kv_offload         = */ nkvo,
-                /* .flash_attn            = */ fa,
-                /* .tensor_split          = */ ts,
-                /* .tensor_buft_overrides = */ ot,
-                /* .use_mmap              = */ mmp,
-                /* .embeddings            = */ embd,
-                /* .no_op_offload         = */ nopo,
-            };
-            instances.push_back(instance);
-        }
-
-        for (const auto & n_pg : params.n_pg) {
-            if (n_pg.first == 0 && n_pg.second == 0) {
-                continue;
-            }
-            cmd_params_instance instance = {
-                /* .model                 = */ m,
-                /* .n_prompt              = */ n_pg.first,
-                /* .n_gen                 = */ n_pg.second,
-                /* .n_depth               = */ nd,
-                /* .n_batch               = */ nb,
-                /* .n_ubatch              = */ nub,
-                /* .type_k                = */ tk,
-                /* .type_v                = */ tv,
-                /* .defrag_thold          = */ defrag_thold,
-                /* .n_threads             = */ nt,
-                /* .cpu_mask              = */ cm,
-                /* .cpu_strict            = */ cs,
-                /* .poll                  = */ pl,
-                /* .n_gpu_layers          = */ nl,
-                /* .rpc_servers           = */ rpc,
-                /* .split_mode            = */ sm,
-                /* .main_gpu              = */ mg,
-                /* .no_kv_offload         = */ nkvo,
-                /* .flash_attn            = */ fa,
-                /* .tensor_split          = */ ts,
-                /* .tensor_buft_overrides = */ ot,
-                /* .use_mmap              = */ mmp,
-                /* .embeddings            = */ embd,
-                /* .no_op_offload         = */ nopo,
-            };
-            instances.push_back(instance);
-        }
-    }
-    // clang-format on
-
-    return instances;
-}
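The loop nesting above is deliberate: every field that `equal_mparams()` compares (model, n_gpu_layers, rpc_servers, split_mode, main_gpu, tensor_split, overrides, use_mmap) sits in an outer loop, so instances that can share a loaded model come out adjacent and the reload check in `main()` fires as rarely as possible. A reduced sketch of the same idea, with hypothetical file names:

```cpp
#include <cstdio>
#include <string>
#include <vector>

struct instance { std::string model; int n_batch; };

int main() {
    std::vector<std::string> models  = {"a.gguf", "b.gguf"}; // hypothetical files
    std::vector<int>         batches = {256, 512};

    // model-level parameter outermost -> equal-model instances are contiguous
    std::vector<instance> instances;
    for (const auto & m : models)
        for (int nb : batches)
            instances.push_back({m, nb});

    std::string loaded;
    for (const auto & inst : instances) {
        if (inst.model != loaded) { // reload only on change: 2 loads instead of 4
            printf("loading %s\n", inst.model.c_str());
            loaded = inst.model;
        }
        printf("  bench n_batch=%d\n", inst.n_batch);
    }
    return 0;
}
```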
-
-struct test {
-    static const std::string build_commit;
-    static const int build_number;
-    const std::string cpu_info;
-    const std::string gpu_info;
-    std::string model_filename;
-    std::string model_type;
-    uint64_t model_size;
-    uint64_t model_n_params;
-    int n_batch;
-    int n_ubatch;
-    int n_threads;
-    std::string cpu_mask;
-    bool cpu_strict;
-    int poll;
-    ggml_type type_k;
-    ggml_type type_v;
-    float defrag_thold;
-    int n_gpu_layers;
-    llama_split_mode split_mode;
-    int main_gpu;
-    bool no_kv_offload;
-    bool flash_attn;
-    std::vector<float> tensor_split;
-    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
-    bool use_mmap;
-    bool embeddings;
-    bool no_op_offload;
-    int n_prompt;
-    int n_gen;
-    int n_depth;
-    std::string test_time;
-    std::vector<uint64_t> samples_ns;
-
-    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
-        cpu_info(get_cpu_info()),
-        gpu_info(get_gpu_info()) {
-
-        model_filename = inst.model;
-        char buf[128];
-        llama_model_desc(lmodel, buf, sizeof(buf));
-        model_type = buf;
-        model_size = llama_model_size(lmodel);
-        model_n_params = llama_model_n_params(lmodel);
-        n_batch = inst.n_batch;
-        n_ubatch = inst.n_ubatch;
-        n_threads = inst.n_threads;
-        cpu_mask = inst.cpu_mask;
-        cpu_strict = inst.cpu_strict;
-        poll = inst.poll;
-        type_k = inst.type_k;
-        type_v = inst.type_v;
-        defrag_thold = inst.defrag_thold;
-        n_gpu_layers = inst.n_gpu_layers;
-        split_mode = inst.split_mode;
-        main_gpu = inst.main_gpu;
-        no_kv_offload = inst.no_kv_offload;
-        flash_attn = inst.flash_attn;
-        tensor_split = inst.tensor_split;
-        tensor_buft_overrides = inst.tensor_buft_overrides;
-        use_mmap = inst.use_mmap;
-        embeddings = inst.embeddings;
-        no_op_offload = inst.no_op_offload;
-        n_prompt = inst.n_prompt;
-        n_gen = inst.n_gen;
-        n_depth = inst.n_depth;
-        // RFC 3339 date-time format
-        time_t t = time(NULL);
-        std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
-        test_time = buf;
-
-        (void) ctx;
-    }
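`"%FT%TZ"` is the strftime shorthand for an RFC 3339 UTC timestamp: `%F` expands to `%Y-%m-%d` and `%T` to `%H:%M:%S`. A self-contained check:

```cpp
#include <cstdio>
#include <ctime>

int main() {
    char buf[64];
    time_t t = time(NULL);
    // same format string as the test constructor above
    std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
    printf("%s\n", buf); // e.g. 2024-01-31T12:34:56Z
    return 0;
}
```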
-
-    uint64_t avg_ns() const { return ::avg(samples_ns); }
-
-    uint64_t stdev_ns() const { return ::stdev(samples_ns); }
-
-    std::vector<double> get_ts() const {
-        int n_tokens = n_prompt + n_gen;
-        std::vector<double> ts;
-        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
-                       [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
-        return ts;
-    }
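`get_ts()` converts each nanosecond sample to throughput with `ts = 1e9 * n_tokens / t_ns`. Worked through once: 512 tokens decoded in 2.5 s (2.5e9 ns) comes out to 204.8 t/s:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    int      n_tokens = 512;           // n_prompt + n_gen for the run
    uint64_t t_ns     = 2500000000ull; // one 2.5 s sample
    double   ts       = 1e9 * n_tokens / t_ns;
    printf("%.1f t/s\n", ts);          // prints 204.8
    return 0;
}
```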
-
-    double avg_ts() const { return ::avg(get_ts()); }
-
-    double stdev_ts() const { return ::stdev(get_ts()); }
-
-    static std::string get_backend() {
-        std::vector<std::string> backends;
-        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
-            auto * reg = ggml_backend_reg_get(i);
-            std::string name = ggml_backend_reg_name(reg);
-            if (name != "CPU") {
-                backends.push_back(ggml_backend_reg_name(reg));
-            }
-        }
-        return backends.empty() ? "CPU" : join(backends, ",");
-    }
-
-    static const std::vector<std::string> & get_fields() {
-        static const std::vector<std::string> fields = {
-            "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
-            "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
-            "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
-            "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
-            "defrag_thold",
-            "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
-            "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
-        };
-        return fields;
-    }
-
-    enum field_type { STRING, BOOL, INT, FLOAT };
-
-    static field_type get_field_type(const std::string & field) {
-        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
-            field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
-            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
-            field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
-            return INT;
-        }
-        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
-            field == "use_mmap" || field == "embeddings") {
-            return BOOL;
-        }
-        if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
-            return FLOAT;
-        }
-        return STRING;
-    }
-
-    std::vector<std::string> get_values() const {
-        std::string tensor_split_str;
-        std::string tensor_buft_overrides_str;
-        int max_nonzero = 0;
-        for (size_t i = 0; i < llama_max_devices(); i++) {
-            if (tensor_split[i] > 0) {
-                max_nonzero = i;
-            }
-        }
-        for (int i = 0; i <= max_nonzero; i++) {
-            char buf[32];
-            snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
-            tensor_split_str += buf;
-            if (i < max_nonzero) {
-                tensor_split_str += "/";
-            }
-        }
-        if (tensor_buft_overrides.size() == 1) {
-            // Last element of tensor_buft_overrides is always a null pattern
-            // so if it is only one element long, it must be a null pattern.
-            GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
-            tensor_buft_overrides_str += "none";
-        } else {
-            for (size_t i = 0; i < tensor_buft_overrides.size() - 1; i++) {
-                // Last element of tensor_buft_overrides is always a null pattern
-                if (tensor_buft_overrides[i].pattern == nullptr) {
-                    tensor_buft_overrides_str += "none";
-                } else {
-                    tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
-                    tensor_buft_overrides_str += "=";
-                    tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
-                }
-                if (i + 2 < tensor_buft_overrides.size()) {
-                    tensor_buft_overrides_str += ";";
-                }
-            }
-        }
-        std::vector<std::string> values = {
-            build_commit,
-            std::to_string(build_number),
-            cpu_info,
-            gpu_info,
-            get_backend(),
-            model_filename,
-            model_type,
-            std::to_string(model_size),
-            std::to_string(model_n_params),
-            std::to_string(n_batch),
-            std::to_string(n_ubatch),
-            std::to_string(n_threads),
-            cpu_mask,
-            std::to_string(cpu_strict),
-            std::to_string(poll),
-            ggml_type_name(type_k),
-            ggml_type_name(type_v),
-            std::to_string(n_gpu_layers),
-            split_mode_str(split_mode),
-            std::to_string(main_gpu),
-            std::to_string(no_kv_offload),
-            std::to_string(flash_attn),
-            tensor_split_str,
-            tensor_buft_overrides_str,
-            std::to_string(defrag_thold),
-            std::to_string(use_mmap),
-            std::to_string(embeddings),
-            std::to_string(no_op_offload),
-            std::to_string(n_prompt),
-            std::to_string(n_gen),
-            std::to_string(n_depth),
-            test_time,
-            std::to_string(avg_ns()),
-            std::to_string(stdev_ns()),
-            std::to_string(avg_ts()),
-            std::to_string(stdev_ts()),
-        };
-        return values;
-    }
-
-    std::map<std::string, std::string> get_map() const {
-        std::map<std::string, std::string> map;
-        auto fields = get_fields();
-        auto values = get_values();
-        std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
-                       std::make_pair<const std::string &, const std::string &>);
-        return map;
-    }
-};
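`get_map()` zips the two parallel vectors from `get_fields()` and `get_values()` into a map by handing `std::transform` a second input iterator. The same trick in isolation (with a lambda in place of the `std::make_pair` instantiation used above):

```cpp
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <map>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> fields = {"n_batch", "n_threads"};
    std::vector<std::string> values = {"2048", "8"};

    // pairwise transform over two equal-length ranges, inserting into the map
    std::map<std::string, std::string> m;
    std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(m, m.end()),
                   [](const std::string & k, const std::string & v) { return std::make_pair(k, v); });

    for (const auto & kv : m) {
        printf("%s = %s\n", kv.first.c_str(), kv.second.c_str());
    }
    return 0;
}
```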
-
-const std::string test::build_commit = LLAMA_COMMIT;
-const int test::build_number = LLAMA_BUILD_NUMBER;
-
-struct printer {
-    virtual ~printer() {}
-
-    FILE * fout;
-
-    virtual void print_header(const cmd_params & params) { (void) params; }
-
-    virtual void print_test(const test & t) = 0;
-
-    virtual void print_footer() {}
-};
-
-struct csv_printer : public printer {
-    static std::string escape_csv(const std::string & field) {
-        std::string escaped = "\"";
-        for (auto c : field) {
-            if (c == '"') {
-                escaped += "\"";
-            }
-            escaped += c;
-        }
-        escaped += "\"";
-        return escaped;
-    }
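`escape_csv` follows the RFC 4180 convention: every field is wrapped in double quotes and any embedded quote is doubled, so `say "hi"` serializes as `"say ""hi"""`. A quick standalone check of the same doubling rule:

```cpp
#include <cstdio>
#include <string>

// same quote-doubling rule as escape_csv above
static std::string escape_csv(const std::string & field) {
    std::string escaped = "\"";
    for (char c : field) {
        if (c == '"') {
            escaped += "\""; // double the embedded quote
        }
        escaped += c;
    }
    escaped += "\"";
    return escaped;
}

int main() {
    printf("%s\n", escape_csv("say \"hi\"").c_str()); // prints "say ""hi"""
    return 0;
}
```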
-
-    void print_header(const cmd_params & params) override {
-        std::vector<std::string> fields = test::get_fields();
-        fprintf(fout, "%s\n", join(fields, ",").c_str());
-        (void) params;
-    }
-
-    void print_test(const test & t) override {
-        std::vector<std::string> values = t.get_values();
-        std::transform(values.begin(), values.end(), values.begin(), escape_csv);
-        fprintf(fout, "%s\n", join(values, ",").c_str());
-    }
-};
-
-static std::string escape_json(const std::string & value) {
-    std::string escaped;
-    for (auto c : value) {
-        if (c == '"') {
-            escaped += "\\\"";
-        } else if (c == '\\') {
-            escaped += "\\\\";
-        } else if (c <= 0x1f) {
-            char buf[8];
-            snprintf(buf, sizeof(buf), "\\u%04x", c);
-            escaped += buf;
-        } else {
-            escaped += c;
-        }
-    }
-    return escaped;
-}
-
-static std::string format_json_value(const std::string & field, const std::string & value) {
-    switch (test::get_field_type(field)) {
-        case test::STRING:
-            return "\"" + escape_json(value) + "\"";
-        case test::BOOL:
-            return value == "0" ? "false" : "true";
-        default:
-            return value;
-    }
-}
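`format_json_value` leans on `test::get_field_type` so INT and FLOAT fields stay unquoted and BOOL fields become `true`/`false`; only STRING fields pass through `escape_json`, which also encodes control characters as `\u00xx`. A trimmed copy of the escaping routine to show the effect:

```cpp
#include <cstdio>
#include <string>

// same control-character rule as escape_json above
static std::string escape_json(const std::string & value) {
    std::string escaped;
    for (char c : value) {
        if (c == '"') {
            escaped += "\\\"";
        } else if (c == '\\') {
            escaped += "\\\\";
        } else if (c <= 0x1f) {
            char buf[8];
            snprintf(buf, sizeof(buf), "\\u%04x", c); // e.g. tab -> \u0009
            escaped += buf;
        } else {
            escaped += c;
        }
    }
    return escaped;
}

int main() {
    printf("%s\n", escape_json("7B\tq4_0").c_str()); // prints 7B\u0009q4_0
    return 0;
}
```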
-
-struct json_printer : public printer {
-    bool first = true;
-
-    void print_header(const cmd_params & params) override {
-        fprintf(fout, "[\n");
-        (void) params;
-    }
-
-    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
-        assert(fields.size() == values.size());
-        for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(),
-                    format_json_value(fields.at(i), values.at(i)).c_str());
-        }
-    }
-
-    void print_test(const test & t) override {
-        if (first) {
-            first = false;
-        } else {
-            fprintf(fout, ",\n");
-        }
-        fprintf(fout, "  {\n");
-        print_fields(test::get_fields(), t.get_values());
-        fprintf(fout, "    \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
-        fprintf(fout, "    \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
-        fprintf(fout, "  }");
-        fflush(fout);
-    }
-
-    void print_footer() override { fprintf(fout, "\n]\n"); }
-};
-
-struct jsonl_printer : public printer {
-    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
-        assert(fields.size() == values.size());
-        for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
-        }
-    }
-
-    void print_test(const test & t) override {
-        fprintf(fout, "{");
-        print_fields(test::get_fields(), t.get_values());
-        fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
-        fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
-        fprintf(fout, "}\n");
-        fflush(fout);
-    }
-};
-
-struct markdown_printer : public printer {
-    std::vector<std::string> fields;
-
-    static int get_field_width(const std::string & field) {
-        if (field == "model") {
-            return -30;
-        }
-        if (field == "t/s") {
-            return 20;
-        }
-        if (field == "size" || field == "params") {
-            return 10;
-        }
-        if (field == "n_gpu_layers") {
-            return 3;
-        }
-        if (field == "n_threads") {
-            return 7;
-        }
-        if (field == "n_batch") {
-            return 7;
-        }
-        if (field == "n_ubatch") {
-            return 8;
-        }
-        if (field == "type_k" || field == "type_v") {
-            return 6;
-        }
-        if (field == "split_mode") {
-            return 5;
-        }
-        if (field == "flash_attn") {
-            return 2;
-        }
-        if (field == "use_mmap") {
-            return 4;
-        }
-        if (field == "test") {
-            return 15;
-        }
-        if (field == "no_op_offload") {
-            return 4;
-        }
-
-        int width = std::max((int) field.length(), 10);
-
-        if (test::get_field_type(field) == test::STRING) {
-            return -width;
-        }
-        return width;
-    }
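The sign convention here piggybacks on printf field widths: `%*s` right-justifies for a positive width and left-justifies for a negative one, which is why string-typed columns return `-width`. A two-line demonstration:

```cpp
#include <cstdio>

int main() {
    printf("|%*s|\n", 10, "pp512");  // |     pp512|  right-aligned (numeric column)
    printf("|%*s|\n", -10, "7B Q4"); // |7B Q4     |  left-aligned (string column)
    return 0;
}
```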
-
-    static std::string get_field_display_name(const std::string & field) {
-        if (field == "n_gpu_layers") {
-            return "ngl";
-        }
-        if (field == "split_mode") {
-            return "sm";
-        }
-        if (field == "n_threads") {
-            return "threads";
-        }
-        if (field == "no_kv_offload") {
-            return "nkvo";
-        }
-        if (field == "flash_attn") {
-            return "fa";
-        }
-        if (field == "use_mmap") {
-            return "mmap";
-        }
-        if (field == "embeddings") {
-            return "embd";
-        }
-        if (field == "no_op_offload") {
-            return "nopo";
-        }
-        if (field == "tensor_split") {
-            return "ts";
-        }
-        if (field == "tensor_buft_overrides") {
-            return "ot";
-        }
-        return field;
-    }
-
-    void print_header(const cmd_params & params) override {
-        // select fields to print
-        fields.emplace_back("model");
-        fields.emplace_back("size");
-        fields.emplace_back("params");
-        fields.emplace_back("backend");
-        bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
-                              test::get_backend().find("BLAS") != std::string::npos;
-        if (!is_cpu_backend) {
-            fields.emplace_back("n_gpu_layers");
-        }
-        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
-            fields.emplace_back("n_threads");
-        }
-        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
-            fields.emplace_back("cpu_mask");
-        }
-        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
-            fields.emplace_back("cpu_strict");
-        }
-        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
-            fields.emplace_back("poll");
-        }
-        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
-            fields.emplace_back("n_batch");
-        }
-        if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
-            fields.emplace_back("n_ubatch");
-        }
-        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
-            fields.emplace_back("type_k");
-        }
-        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
-            fields.emplace_back("type_v");
-        }
-        if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
-            fields.emplace_back("defrag_thold");
-        }
-        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
-            fields.emplace_back("main_gpu");
-        }
-        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
-            fields.emplace_back("split_mode");
-        }
-        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
-            fields.emplace_back("no_kv_offload");
-        }
-        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
-            fields.emplace_back("flash_attn");
-        }
-        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
-            fields.emplace_back("tensor_split");
-        }
-        if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
-            fields.emplace_back("tensor_buft_overrides");
-        }
-        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
-            fields.emplace_back("use_mmap");
-        }
-        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
-            fields.emplace_back("embeddings");
-        }
-        if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
-            fields.emplace_back("no_op_offload");
-        }
-        fields.emplace_back("test");
-        fields.emplace_back("t/s");
-
-        fprintf(fout, "|");
-        for (const auto & field : fields) {
-            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
-        }
-        fprintf(fout, "\n");
-        fprintf(fout, "|");
-        for (const auto & field : fields) {
-            int width = get_field_width(field);
-            fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
-        }
-        fprintf(fout, "\n");
-    }
-
-    void print_test(const test & t) override {
-        std::map<std::string, std::string> vmap = t.get_map();
-
-        fprintf(fout, "|");
-        for (const auto & field : fields) {
-            std::string value;
-            char buf[128];
-            if (field == "model") {
-                value = t.model_type;
-            } else if (field == "size") {
-                if (t.model_size < 1024 * 1024 * 1024) {
-                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
-                } else {
-                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
-                }
-                value = buf;
-            } else if (field == "params") {
-                if (t.model_n_params < 1000 * 1000 * 1000) {
-                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
-                } else {
-                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
-                }
-                value = buf;
-            } else if (field == "backend") {
-                value = test::get_backend();
-            } else if (field == "test") {
-                if (t.n_prompt > 0 && t.n_gen == 0) {
-                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
-                } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
-                } else {
-                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
-                }
-                if (t.n_depth > 0) {
-                    int len = strlen(buf);
-                    snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
-                }
-                value = buf;
-            } else if (field == "t/s") {
-                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
-                value = buf;
-            } else if (vmap.find(field) != vmap.end()) {
-                value = vmap.at(field);
-            } else {
-                assert(false);
-                exit(1);
-            }
-
-            int width = get_field_width(field);
-            if (field == "t/s") {
-                // HACK: the utf-8 character is 2 bytes
-                width += 1;
-            }
-            fprintf(fout, " %*s |", width, value.c_str());
-        }
-        fprintf(fout, "\n");
-    }
-
-    void print_footer() override {
-        fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
-    }
-};
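One subtlety in `print_test` above: printf widths count bytes, not display columns, and the `±` in the t/s cell occupies two bytes in UTF-8 while rendering as one column, so the field width is bumped by one to keep the table pipes aligned. A minimal demonstration:

```cpp
#include <cstdio>

int main() {
    const char * cell = "12.34 ± 0.56"; // "±" is 2 bytes but 1 display column
    printf("|%*s|\n", 20, cell);        // one column short on screen
    printf("|%*s|\n", 20 + 1, cell);    // compensated; lines up with ASCII rows
    return 0;
}
```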
-
-struct sql_printer : public printer {
-    static std::string get_sql_field_type(const std::string & field) {
-        switch (test::get_field_type(field)) {
-            case test::STRING:
-                return "TEXT";
-            case test::BOOL:
-            case test::INT:
-                return "INTEGER";
-            case test::FLOAT:
-                return "REAL";
-            default:
-                assert(false);
-                exit(1);
-        }
-    }
-
-    void print_header(const cmd_params & params) override {
-        std::vector<std::string> fields = test::get_fields();
-        fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
-        for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
-                    i < fields.size() - 1 ? "," : "");
-        }
-        fprintf(fout, ");\n");
-        fprintf(fout, "\n");
-        (void) params;
-    }
-
-    void print_test(const test & t) override {
-        fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
-        fprintf(fout, "VALUES (");
-        std::vector<std::string> values = t.get_values();
-        for (size_t i = 0; i < values.size(); i++) {
-            fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
-        }
-        fprintf(fout, ");\n");
-    }
-};
-
-static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
-    llama_set_n_threads(ctx, n_threads, n_threads);
-
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-
-    std::vector<llama_token> tokens(n_batch);
-
-    int n_processed = 0;
-
-    while (n_processed < n_prompt) {
-        int n_tokens = std::min(n_prompt - n_processed, n_batch);
-        tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
-        for (int i = 1; i < n_tokens; i++) {
-            tokens[i] = std::rand() % n_vocab;
-        }
-        int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
-        if (res != 0) {
-            fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res);
-            return false;
-        }
-        n_processed += n_tokens;
-    }
-
-    llama_synchronize(ctx);
-    return true;
-}
-
-static bool test_gen(llama_context * ctx, int n_gen, int n_threads) {
-    llama_set_n_threads(ctx, n_threads, n_threads);
-
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-
-    llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
-
-    for (int i = 0; i < n_gen; i++) {
-        int res = llama_decode(ctx, llama_batch_get_one(&token, 1));
-        if (res != 0) {
-            fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res);
-            return false;
-        }
-        llama_synchronize(ctx);
-        token = std::rand() % n_vocab;
-    }
-    return true;
-}
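The two drivers mirror the two numbers llama-bench reports: `test_prompt` decodes random tokens in batches of up to `n_batch` and synchronizes once at the end (prompt processing), while `test_gen` decodes one token at a time and synchronizes after every step, as real generation must. A sketch of how a repetition is timed around them, with `std::chrono` standing in for the `get_time_ns` helper defined earlier in this file:

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>

static uint64_t time_ns() {
    using namespace std::chrono;
    return duration_cast<nanoseconds>(steady_clock::now().time_since_epoch()).count();
}

// hypothetical stand-in for one benchmark repetition (test_prompt / test_gen)
static void run_once() { /* decode work goes here */ }

int main() {
    uint64_t t_start = time_ns();
    run_once();
    uint64_t t_ns = time_ns() - t_start; // what gets pushed into test::samples_ns
    printf("sample: %llu ns\n", (unsigned long long) t_ns);
    return 0;
}
```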
-
-static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) text;
-    (void) user_data;
-}
-
-static std::unique_ptr<printer> create_printer(output_formats format) {
-    switch (format) {
-        case NONE:
-            return nullptr;
-        case CSV:
-            return std::unique_ptr<printer>(new csv_printer());
-        case JSON:
-            return std::unique_ptr<printer>(new json_printer());
-        case JSONL:
-            return std::unique_ptr<printer>(new jsonl_printer());
-        case MARKDOWN:
-            return std::unique_ptr<printer>(new markdown_printer());
-        case SQL:
-            return std::unique_ptr<printer>(new sql_printer());
-    }
-    GGML_ABORT("fatal error");
-}
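`create_printer` is the single point where the `output_formats` enum meets a concrete printer; `NONE` maps to `nullptr`, which is why `main()` below guards every use. A sketch of the intended call pattern (not standalone; it reuses the declarations from this file, and `results` is a hypothetical container of finished tests):

```cpp
// assuming the printer hierarchy and output_formats enum defined above
std::unique_ptr<printer> p = create_printer(MARKDOWN);
if (p) {                  // NONE yields nullptr, so output is optional
    p->fout = stdout;
    p->print_header(params);
    for (const test & t : results) {
        p->print_test(t); // one row / object per benchmark instance
    }
    p->print_footer();
}
```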
-
-int main(int argc, char ** argv) {
-    // try to set locale for unicode characters in markdown
-    setlocale(LC_CTYPE, ".UTF-8");
-
-#if !defined(NDEBUG)
-    fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
-#endif
-
-#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
-    fprintf(stderr, "warning: debug build, performance may be affected\n");
-#endif
-
-#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
-    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
-#endif
-
-    // initialize backends
-    ggml_backend_load_all();
-
-    cmd_params params = parse_cmd_params(argc, argv);
-
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (!cpu_dev) {
-        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
-        return 1;
-    }
-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
-    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
-
-    // initialize llama.cpp
-    if (!params.verbose) {
-        llama_log_set(llama_null_log_callback, NULL);
-    }
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    set_process_priority(params.prio);
-
-    // initialize printer
-    std::unique_ptr<printer> p = create_printer(params.output_format);
-    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
-
-    if (p) {
-        p->fout = stdout;
-        p->print_header(params);
-    }
-
-    if (p_err) {
-        p_err->fout = stderr;
-        p_err->print_header(params);
-    }
-
-    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
-
-    llama_model * lmodel = nullptr;
-    const cmd_params_instance * prev_inst = nullptr;
-
-    int params_idx = 0;
-    auto params_count = params_instances.size();
-    for (const auto & inst : params_instances) {
-        params_idx++;
-        if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
-        }
-        // keep the same model between tests when possible
-        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
-            if (lmodel) {
-                llama_model_free(lmodel);
-            }
-
-            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
-            if (lmodel == NULL) {
-                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
-                return 1;
-            }
-            prev_inst = &inst;
-        }
-
-        llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
-            llama_model_free(lmodel);
-            return 1;
-        }
-
-        test t(inst, lmodel, ctx);
-
-        llama_kv_self_clear(ctx);
-
-        // cool off before the test
-        if (params.delay) {
-            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
-        }
-
-        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
-        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
-            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
-            exit(1);
-        }
-        tpp.strict_cpu = t.cpu_strict;
-        tpp.poll = t.poll;
-        tpp.prio = params.prio;
-
-        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
-        if (!threadpool) {
-            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-            exit(1);
-        }
-
-        llama_attach_threadpool(ctx, threadpool, NULL);
-
-        // warmup run
-        if (t.n_prompt > 0) {
-            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
-            }
-            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
-            if (!res) {
-                fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
-                exit(1);
-            }
-        }
-        if (t.n_gen > 0) {
-            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
-            }
-            bool res = test_gen(ctx, 1, t.n_threads);
-            if (!res) {
-                fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
-                exit(1);
-            }
-        }
-
-        for (int i = 0; i < params.reps; i++) {
-            llama_kv_self_clear(ctx);
-
-            if (t.n_depth > 0) {
-                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
-                            i + 1, params.reps);
-                }
-                bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
-                if (!res) {
-                    fprintf(stderr, "%s: error: failed to run depth\n", __func__);
-                    exit(1);
-                }
-            }
-
-            uint64_t t_start = get_time_ns();
-
-            if (t.n_prompt > 0) {
-                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
-                            i + 1, params.reps);
-                }
-                bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
-                if (!res) {
-                    fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
-                    exit(1);
-                }
-            }
-            if (t.n_gen > 0) {
-                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
-                            i + 1, params.reps);
-                }
-                bool res = test_gen(ctx, t.n_gen, t.n_threads);
-                if (!res) {
-                    fprintf(stderr, "%s: error: failed to run gen\n", __func__);
-                    exit(1);
-                }
-            }
-
-            uint64_t t_ns = get_time_ns() - t_start;
-            t.samples_ns.push_back(t_ns);
-        }
-
-        if (p) {
-            p->print_test(t);
-            fflush(p->fout);
-        }
-
-        if (p_err) {
-            p_err->print_test(t);
-            fflush(p_err->fout);
-        }
-
-        llama_perf_context_print(ctx);
-
-        llama_free(ctx);
-
-        ggml_threadpool_free_fn(threadpool);
-    }
-
-    llama_model_free(lmodel);
-
-    if (p) {
-        p->print_footer();
-    }
-
-    if (p_err) {
-        p_err->print_footer();
-    }
-
-    llama_backend_free();
-
-    return 0;
-}