@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -36,6 +36,46 @@ static uint64_t get_time_ns() {
36
36
  return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
37
37
  }
38
38
 
39
+ static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
40
+ if (a.pattern != b.pattern) {
41
+ // C-string comparison; either pattern pointer may be null
42
+ if (a.pattern == nullptr || b.pattern == nullptr) {
43
+ return false;
44
+ }
45
+ if (strcmp(a.pattern, b.pattern) != 0) {
46
+ return false;
47
+ }
48
+ }
49
+ if (a.buft != b.buft) {
50
+ return false;
51
+ }
52
+ return true;
53
+ }
54
+
55
+ static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
56
+ if (a.size() != b.size()) {
57
+ return false;
58
+ }
59
+ for (size_t i = 0; i < a.size(); i++) {
60
+ if (!tensor_buft_override_equal(a[i], b[i])) {
61
+ return false;
62
+ }
63
+ }
64
+ return true;
65
+ }
66
+
67
+ static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
68
+ if (a.size() != b.size()) {
69
+ return false;
70
+ }
71
+ for (size_t i = 0; i < a.size(); i++) {
72
+ if (!vec_tensor_buft_override_equal(a[i], b[i])) {
73
+ return false;
74
+ }
75
+ }
76
+ return true;
77
+ }
78
+
39
79
  template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
40
80
  std::ostringstream str;
41
81
  for (size_t i = 0; i < values.size(); i++) {
@@ -155,15 +195,57 @@ static std::string pair_str(const std::pair<int, int> & p) {
155
195
  return buf;
156
196
  }
157
197
 
198
+ static std::vector<int> parse_int_range(const std::string & s) {
199
+ // first[-last[(+|*)step]]
200
+ std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
201
+
202
+ std::smatch match;
203
+ std::string::const_iterator search_start(s.cbegin());
204
+ std::vector<int> result;
205
+ while (std::regex_search(search_start, s.cend(), match, range_regex)) {
206
+ int first = std::stoi(match[1]);
207
+ int last = match[2].matched ? std::stoi(match[2]) : first;
208
+ char op = match[3].matched ? match[3].str()[0] : '+';
209
+ int step = match[4].matched ? std::stoi(match[4]) : 1;
210
+
211
+ for (int i = first; i <= last;) {
212
+ result.push_back(i);
213
+
214
+ int prev_i = i;
215
+
216
+ if (op == '+') {
217
+ i += step;
218
+ } else if (op == '*') {
219
+ i *= step;
220
+ } else {
221
+ throw std::invalid_argument("invalid range format");
222
+ }
223
+
224
+ if (i <= prev_i) {
225
+ throw std::invalid_argument("invalid range");
226
+ }
227
+ }
228
+ search_start = match.suffix().first;
229
+ }
230
+
231
+ if (search_start != s.cend()) {
232
+ throw std::invalid_argument("invalid range format");
233
+ }
234
+
235
+ return result;
236
+ }
237
+
158
238
  struct cmd_params {
159
239
  std::vector<std::string> model;
160
240
  std::vector<int> n_prompt;
161
241
  std::vector<int> n_gen;
162
242
  std::vector<std::pair<int, int>> n_pg;
243
+ std::vector<int> n_depth;
163
244
  std::vector<int> n_batch;
164
245
  std::vector<int> n_ubatch;
165
246
  std::vector<ggml_type> type_k;
166
247
  std::vector<ggml_type> type_v;
248
+ std::vector<float> defrag_thold;
167
249
  std::vector<int> n_threads;
168
250
  std::vector<std::string> cpu_mask;
169
251
  std::vector<bool> cpu_strict;
@@ -175,8 +257,10 @@ struct cmd_params {
175
257
  std::vector<bool> no_kv_offload;
176
258
  std::vector<bool> flash_attn;
177
259
  std::vector<std::vector<float>> tensor_split;
260
+ std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
178
261
  std::vector<bool> use_mmap;
179
262
  std::vector<bool> embeddings;
263
+ std::vector<bool> no_op_offload;
180
264
  ggml_numa_strategy numa;
181
265
  int reps;
182
266
  ggml_sched_priority prio;
@@ -192,10 +276,12 @@ static const cmd_params cmd_params_defaults = {
192
276
  /* n_prompt */ { 512 },
193
277
  /* n_gen */ { 128 },
194
278
  /* n_pg */ {},
279
+ /* n_depth */ { 0 },
195
280
  /* n_batch */ { 2048 },
196
281
  /* n_ubatch */ { 512 },
197
282
  /* type_k */ { GGML_TYPE_F16 },
198
283
  /* type_v */ { GGML_TYPE_F16 },
284
+ /* defrag_thold */ { -1.0f },
199
285
  /* n_threads */ { cpu_get_num_math() },
200
286
  /* cpu_mask */ { "0x0" },
201
287
  /* cpu_strict */ { false },
@@ -207,8 +293,10 @@ static const cmd_params cmd_params_defaults = {
207
293
  /* no_kv_offload */ { false },
208
294
  /* flash_attn */ { false },
209
295
  /* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
296
+ /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
210
297
  /* use_mmap */ { true },
211
298
  /* embeddings */ { false },
299
+ /* no_op_offload */ { false },
212
300
  /* numa */ GGML_NUMA_STRATEGY_DISABLED,
213
301
  /* reps */ 5,
214
302
  /* prio */ GGML_SCHED_PRIO_NORMAL,
@@ -224,12 +312,29 @@ static void print_usage(int /* argc */, char ** argv) {
224
312
  printf("\n");
225
313
  printf("options:\n");
226
314
  printf(" -h, --help\n");
315
+ printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
316
+ printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
317
+ cmd_params_defaults.reps);
318
+ printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n",
319
+ cmd_params_defaults.prio);
320
+ printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
321
+ cmd_params_defaults.delay);
322
+ printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
323
+ output_format_str(cmd_params_defaults.output_format));
324
+ printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
325
+ output_format_str(cmd_params_defaults.output_format_stderr));
326
+ printf(" -v, --verbose verbose output\n");
327
+ printf(" --progress print test progress indicators\n");
328
+ printf("\n");
329
+ printf("test parameters:\n");
227
330
  printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
228
331
  printf(" -p, --n-prompt <n> (default: %s)\n",
229
332
  join(cmd_params_defaults.n_prompt, ",").c_str());
230
333
  printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
231
334
  printf(" -pg <pp,tg> (default: %s)\n",
232
335
  join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
336
+ printf(" -d, --n-depth <n> (default: %s)\n",
337
+ join(cmd_params_defaults.n_depth, ",").c_str());
233
338
  printf(" -b, --batch-size <n> (default: %s)\n",
234
339
  join(cmd_params_defaults.n_batch, ",").c_str());
235
340
  printf(" -ub, --ubatch-size <n> (default: %s)\n",
@@ -238,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) {
238
343
  join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
239
344
  printf(" -ctv, --cache-type-v <t> (default: %s)\n",
240
345
  join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
346
+ printf(" -dt, --defrag-thold <f> (default: %s)\n",
347
+ join(cmd_params_defaults.defrag_thold, ",").c_str());
241
348
  printf(" -t, --threads <n> (default: %s)\n",
242
349
  join(cmd_params_defaults.n_threads, ",").c_str());
243
350
  printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
@@ -261,23 +368,17 @@ static void print_usage(int /* argc */, char ** argv) {
261
368
  join(cmd_params_defaults.flash_attn, ",").c_str());
262
369
  printf(" -mmp, --mmap <0|1> (default: %s)\n",
263
370
  join(cmd_params_defaults.use_mmap, ",").c_str());
264
- printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
265
371
  printf(" -embd, --embeddings <0|1> (default: %s)\n",
266
372
  join(cmd_params_defaults.embeddings, ",").c_str());
267
373
  printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
268
- printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
269
- printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
270
- printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
271
- printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
272
- output_format_str(cmd_params_defaults.output_format));
273
- printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
274
- output_format_str(cmd_params_defaults.output_format_stderr));
275
- printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
276
- printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
374
+ printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
375
+ printf(" (default: disabled)\n");
376
+ printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
277
377
  printf("\n");
278
378
  printf(
279
- "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
280
- "multiple times.\n");
379
+ "Multiple values can be given for each parameter by separating them with ','\n"
380
+ "or by specifying the parameter multiple times. Ranges can be given as\n"
381
+ "'first-last' or 'first-last+step' or 'first-last*mult'.\n");
281
382
  }
282
383
 
283
384
  static ggml_type ggml_type_from_name(const std::string & s) {
@@ -331,179 +432,197 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
331
432
  std::replace(arg.begin(), arg.end(), '_', '-');
332
433
  }
333
434
 
334
- if (arg == "-h" || arg == "--help") {
335
- print_usage(argc, argv);
336
- exit(0);
337
- } else if (arg == "-m" || arg == "--model") {
338
- if (++i >= argc) {
339
- invalid_param = true;
340
- break;
341
- }
342
- auto p = string_split<std::string>(argv[i], split_delim);
343
- params.model.insert(params.model.end(), p.begin(), p.end());
344
- } else if (arg == "-p" || arg == "--n-prompt") {
345
- if (++i >= argc) {
346
- invalid_param = true;
347
- break;
348
- }
349
- auto p = string_split<int>(argv[i], split_delim);
350
- params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
351
- } else if (arg == "-n" || arg == "--n-gen") {
352
- if (++i >= argc) {
353
- invalid_param = true;
354
- break;
355
- }
356
- auto p = string_split<int>(argv[i], split_delim);
357
- params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
358
- } else if (arg == "-pg") {
359
- if (++i >= argc) {
360
- invalid_param = true;
361
- break;
362
- }
363
- auto p = string_split<std::string>(argv[i], ',');
364
- if (p.size() != 2) {
365
- invalid_param = true;
366
- break;
367
- }
368
- params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
369
- } else if (arg == "-b" || arg == "--batch-size") {
370
- if (++i >= argc) {
371
- invalid_param = true;
372
- break;
373
- }
374
- auto p = string_split<int>(argv[i], split_delim);
375
- params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
376
- } else if (arg == "-ub" || arg == "--ubatch-size") {
377
- if (++i >= argc) {
378
- invalid_param = true;
379
- break;
380
- }
381
- auto p = string_split<int>(argv[i], split_delim);
382
- params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
383
- } else if (arg == "-ctk" || arg == "--cache-type-k") {
384
- if (++i >= argc) {
385
- invalid_param = true;
386
- break;
387
- }
388
- auto p = string_split<std::string>(argv[i], split_delim);
389
- std::vector<ggml_type> types;
390
- for (const auto & t : p) {
391
- ggml_type gt = ggml_type_from_name(t);
392
- if (gt == GGML_TYPE_COUNT) {
435
+ try {
436
+ if (arg == "-h" || arg == "--help") {
437
+ print_usage(argc, argv);
438
+ exit(0);
439
+ } else if (arg == "-m" || arg == "--model") {
440
+ if (++i >= argc) {
393
441
  invalid_param = true;
394
442
  break;
395
443
  }
396
- types.push_back(gt);
397
- }
398
- if (invalid_param) {
399
- break;
400
- }
401
- params.type_k.insert(params.type_k.end(), types.begin(), types.end());
402
- } else if (arg == "-ctv" || arg == "--cache-type-v") {
403
- if (++i >= argc) {
404
- invalid_param = true;
405
- break;
406
- }
407
- auto p = string_split<std::string>(argv[i], split_delim);
408
- std::vector<ggml_type> types;
409
- for (const auto & t : p) {
410
- ggml_type gt = ggml_type_from_name(t);
411
- if (gt == GGML_TYPE_COUNT) {
444
+ auto p = string_split<std::string>(argv[i], split_delim);
445
+ params.model.insert(params.model.end(), p.begin(), p.end());
446
+ } else if (arg == "-p" || arg == "--n-prompt") {
447
+ if (++i >= argc) {
412
448
  invalid_param = true;
413
449
  break;
414
450
  }
415
- types.push_back(gt);
416
- }
417
- if (invalid_param) {
418
- break;
419
- }
420
- params.type_v.insert(params.type_v.end(), types.begin(), types.end());
421
- } else if (arg == "-t" || arg == "--threads") {
422
- if (++i >= argc) {
423
- invalid_param = true;
424
- break;
425
- }
426
- auto p = string_split<int>(argv[i], split_delim);
427
- params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
428
- } else if (arg == "-C" || arg == "--cpu-mask") {
429
- if (++i >= argc) {
430
- invalid_param = true;
431
- break;
432
- }
433
- auto p = string_split<std::string>(argv[i], split_delim);
434
- params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
435
- } else if (arg == "--cpu-strict") {
436
- if (++i >= argc) {
437
- invalid_param = true;
438
- break;
439
- }
440
- auto p = string_split<bool>(argv[i], split_delim);
441
- params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
442
- } else if (arg == "--poll") {
443
- if (++i >= argc) {
444
- invalid_param = true;
445
- break;
446
- }
447
- auto p = string_split<int>(argv[i], split_delim);
448
- params.poll.insert(params.poll.end(), p.begin(), p.end());
449
- } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
450
- if (++i >= argc) {
451
- invalid_param = true;
452
- break;
453
- }
454
- auto p = string_split<int>(argv[i], split_delim);
455
- params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
456
- } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
457
- if (++i >= argc) {
458
- invalid_param = true;
459
- break;
460
- }
461
- params.rpc_servers.push_back(argv[i]);
462
- } else if (arg == "-sm" || arg == "--split-mode") {
463
- if (++i >= argc) {
464
- invalid_param = true;
465
- break;
466
- }
467
- auto p = string_split<std::string>(argv[i], split_delim);
468
- std::vector<llama_split_mode> modes;
469
- for (const auto & m : p) {
470
- llama_split_mode mode;
471
- if (m == "none") {
472
- mode = LLAMA_SPLIT_MODE_NONE;
473
- } else if (m == "layer") {
474
- mode = LLAMA_SPLIT_MODE_LAYER;
475
- } else if (m == "row") {
476
- mode = LLAMA_SPLIT_MODE_ROW;
477
- } else {
451
+ auto p = parse_int_range(argv[i]);
452
+ params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
453
+ } else if (arg == "-n" || arg == "--n-gen") {
454
+ if (++i >= argc) {
455
+ invalid_param = true;
456
+ break;
457
+ }
458
+ auto p = parse_int_range(argv[i]);
459
+ params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
460
+ } else if (arg == "-pg") {
461
+ if (++i >= argc) {
462
+ invalid_param = true;
463
+ break;
464
+ }
465
+ auto p = string_split<std::string>(argv[i], ',');
466
+ if (p.size() != 2) {
467
+ invalid_param = true;
468
+ break;
469
+ }
470
+ params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
471
+ } else if (arg == "-d" || arg == "--n-depth") {
472
+ if (++i >= argc) {
473
+ invalid_param = true;
474
+ break;
475
+ }
476
+ auto p = parse_int_range(argv[i]);
477
+ params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
478
+ } else if (arg == "-b" || arg == "--batch-size") {
479
+ if (++i >= argc) {
480
+ invalid_param = true;
481
+ break;
482
+ }
483
+ auto p = parse_int_range(argv[i]);
484
+ params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
485
+ } else if (arg == "-ub" || arg == "--ubatch-size") {
486
+ if (++i >= argc) {
487
+ invalid_param = true;
488
+ break;
489
+ }
490
+ auto p = parse_int_range(argv[i]);
491
+ params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
492
+ } else if (arg == "-ctk" || arg == "--cache-type-k") {
493
+ if (++i >= argc) {
494
+ invalid_param = true;
495
+ break;
496
+ }
497
+ auto p = string_split<std::string>(argv[i], split_delim);
498
+
499
+ std::vector<ggml_type> types;
500
+ for (const auto & t : p) {
501
+ ggml_type gt = ggml_type_from_name(t);
502
+ if (gt == GGML_TYPE_COUNT) {
503
+ invalid_param = true;
504
+ break;
505
+ }
506
+ types.push_back(gt);
507
+ }
508
+ if (invalid_param) {
509
+ break;
510
+ }
511
+ params.type_k.insert(params.type_k.end(), types.begin(), types.end());
512
+ } else if (arg == "-ctv" || arg == "--cache-type-v") {
513
+ if (++i >= argc) {
514
+ invalid_param = true;
515
+ break;
516
+ }
517
+ auto p = string_split<std::string>(argv[i], split_delim);
518
+
519
+ std::vector<ggml_type> types;
520
+ for (const auto & t : p) {
521
+ ggml_type gt = ggml_type_from_name(t);
522
+ if (gt == GGML_TYPE_COUNT) {
523
+ invalid_param = true;
524
+ break;
525
+ }
526
+ types.push_back(gt);
527
+ }
528
+ if (invalid_param) {
529
+ break;
530
+ }
531
+ params.type_v.insert(params.type_v.end(), types.begin(), types.end());
532
+ } else if (arg == "-dt" || arg == "--defrag-thold") {
533
+ if (++i >= argc) {
534
+ invalid_param = true;
535
+ break;
536
+ }
537
+ auto p = string_split<float>(argv[i], split_delim);
538
+ params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
539
+ } else if (arg == "-t" || arg == "--threads") {
540
+ if (++i >= argc) {
541
+ invalid_param = true;
542
+ break;
543
+ }
544
+ auto p = parse_int_range(argv[i]);
545
+ params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
546
+ } else if (arg == "-C" || arg == "--cpu-mask") {
547
+ if (++i >= argc) {
548
+ invalid_param = true;
549
+ break;
550
+ }
551
+ auto p = string_split<std::string>(argv[i], split_delim);
552
+ params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
553
+ } else if (arg == "--cpu-strict") {
554
+ if (++i >= argc) {
555
+ invalid_param = true;
556
+ break;
557
+ }
558
+ auto p = string_split<bool>(argv[i], split_delim);
559
+ params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
560
+ } else if (arg == "--poll") {
561
+ if (++i >= argc) {
562
+ invalid_param = true;
563
+ break;
564
+ }
565
+ auto p = parse_int_range(argv[i]);
566
+ params.poll.insert(params.poll.end(), p.begin(), p.end());
567
+ } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
568
+ if (++i >= argc) {
569
+ invalid_param = true;
570
+ break;
571
+ }
572
+ auto p = parse_int_range(argv[i]);
573
+ params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
574
+ } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
575
+ if (++i >= argc) {
576
+ invalid_param = true;
577
+ break;
578
+ }
579
+ params.rpc_servers.push_back(argv[i]);
580
+ } else if (arg == "-sm" || arg == "--split-mode") {
581
+ if (++i >= argc) {
582
+ invalid_param = true;
583
+ break;
584
+ }
585
+ auto p = string_split<std::string>(argv[i], split_delim);
586
+
587
+ std::vector<llama_split_mode> modes;
588
+ for (const auto & m : p) {
589
+ llama_split_mode mode;
590
+ if (m == "none") {
591
+ mode = LLAMA_SPLIT_MODE_NONE;
592
+ } else if (m == "layer") {
593
+ mode = LLAMA_SPLIT_MODE_LAYER;
594
+ } else if (m == "row") {
595
+ mode = LLAMA_SPLIT_MODE_ROW;
596
+ } else {
597
+ invalid_param = true;
598
+ break;
599
+ }
600
+ modes.push_back(mode);
601
+ }
602
+ if (invalid_param) {
603
+ break;
604
+ }
605
+ params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
606
+ } else if (arg == "-mg" || arg == "--main-gpu") {
607
+ if (++i >= argc) {
608
+ invalid_param = true;
609
+ break;
610
+ }
611
+ params.main_gpu = parse_int_range(argv[i]);
612
+ } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
613
+ if (++i >= argc) {
614
+ invalid_param = true;
615
+ break;
616
+ }
617
+ auto p = string_split<bool>(argv[i], split_delim);
618
+ params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
619
+ } else if (arg == "--numa") {
620
+ if (++i >= argc) {
478
621
  invalid_param = true;
479
622
  break;
480
623
  }
481
- modes.push_back(mode);
482
- }
483
- if (invalid_param) {
484
- break;
485
- }
486
- params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
487
- } else if (arg == "-mg" || arg == "--main-gpu") {
488
- if (++i >= argc) {
489
- invalid_param = true;
490
- break;
491
- }
492
- params.main_gpu = string_split<int>(argv[i], split_delim);
493
- } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
494
- if (++i >= argc) {
495
- invalid_param = true;
496
- break;
497
- }
498
- auto p = string_split<bool>(argv[i], split_delim);
499
- params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
500
- } else if (arg == "--numa") {
501
- if (++i >= argc) {
502
- invalid_param = true;
503
- break;
504
- } else {
505
624
  std::string value(argv[i]);
506
- /**/ if (value == "distribute" || value == "") {
625
+ if (value == "distribute" || value == "") {
507
626
  params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
508
627
  } else if (value == "isolate") {
509
628
  params.numa = GGML_NUMA_STRATEGY_ISOLATE;
@@ -513,89 +632,183 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
513
632
  invalid_param = true;
514
633
  break;
515
634
  }
516
- }
517
- } else if (arg == "-fa" || arg == "--flash-attn") {
518
- if (++i >= argc) {
519
- invalid_param = true;
520
- break;
521
- }
522
- auto p = string_split<bool>(argv[i], split_delim);
523
- params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
524
- } else if (arg == "-mmp" || arg == "--mmap") {
525
- if (++i >= argc) {
526
- invalid_param = true;
527
- break;
528
- }
529
- auto p = string_split<bool>(argv[i], split_delim);
530
- params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
531
- } else if (arg == "-embd" || arg == "--embeddings") {
532
- if (++i >= argc) {
533
- invalid_param = true;
534
- break;
535
- }
536
- auto p = string_split<bool>(argv[i], split_delim);
537
- params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
538
- } else if (arg == "-ts" || arg == "--tensor-split") {
539
- if (++i >= argc) {
540
- invalid_param = true;
541
- break;
542
- }
543
- for (auto ts : string_split<std::string>(argv[i], split_delim)) {
544
- // split string by ; and /
545
- const std::regex regex{ R"([;/]+)" };
546
- std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
547
- std::vector<std::string> split_arg{ it, {} };
548
- GGML_ASSERT(split_arg.size() <= llama_max_devices());
549
-
550
- std::vector<float> tensor_split(llama_max_devices());
551
- for (size_t i = 0; i < llama_max_devices(); ++i) {
552
- if (i < split_arg.size()) {
553
- tensor_split[i] = std::stof(split_arg[i]);
635
+ } else if (arg == "-fa" || arg == "--flash-attn") {
636
+ if (++i >= argc) {
637
+ invalid_param = true;
638
+ break;
639
+ }
640
+ auto p = string_split<bool>(argv[i], split_delim);
641
+ params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
642
+ } else if (arg == "-mmp" || arg == "--mmap") {
643
+ if (++i >= argc) {
644
+ invalid_param = true;
645
+ break;
646
+ }
647
+ auto p = string_split<bool>(argv[i], split_delim);
648
+ params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
649
+ } else if (arg == "-embd" || arg == "--embeddings") {
650
+ if (++i >= argc) {
651
+ invalid_param = true;
652
+ break;
653
+ }
654
+ auto p = string_split<bool>(argv[i], split_delim);
655
+ params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
656
+ } else if (arg == "-nopo" || arg == "--no-op-offload") {
657
+ if (++i >= argc) {
658
+ invalid_param = true;
659
+ break;
660
+ }
661
+ auto p = string_split<bool>(argv[i], split_delim);
662
+ params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
663
+ } else if (arg == "-ts" || arg == "--tensor-split") {
664
+ if (++i >= argc) {
665
+ invalid_param = true;
666
+ break;
667
+ }
668
+ for (auto ts : string_split<std::string>(argv[i], split_delim)) {
669
+ // split string by ; and /
670
+ const std::regex regex{ R"([;/]+)" };
671
+ std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
672
+ std::vector<std::string> split_arg{ it, {} };
673
+ GGML_ASSERT(split_arg.size() <= llama_max_devices());
674
+
675
+ std::vector<float> tensor_split(llama_max_devices());
676
+ for (size_t i = 0; i < llama_max_devices(); ++i) {
677
+ if (i < split_arg.size()) {
678
+ tensor_split[i] = std::stof(split_arg[i]);
679
+ } else {
680
+ tensor_split[i] = 0.0f;
681
+ }
682
+ }
683
+ params.tensor_split.push_back(tensor_split);
684
+ }
685
+ } else if (arg == "-ot" || arg == "--override-tensor") {
686
+ if (++i >= argc) {
687
+ invalid_param = true;
688
+ break;
689
+ }
690
+ auto * value = argv[i];
691
+ /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
692
+ if (buft_list.empty()) {
693
+ // enumerate all the devices and add their buffer types to the list
694
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
695
+ auto * dev = ggml_backend_dev_get(i);
696
+ auto * buft = ggml_backend_dev_buffer_type(dev);
697
+ if (buft) {
698
+ buft_list[ggml_backend_buft_name(buft)] = buft;
699
+ }
700
+ }
701
+ }
702
+ auto override_group_span_len = std::strcspn(value, ",");
703
+ bool last_group = false;
704
+ do {
705
+ if (override_group_span_len == 0) {
706
+ // Adds an empty override-tensors for an empty span
707
+ params.tensor_buft_overrides.push_back({{}});
708
+ if (value[override_group_span_len] == '\0') {
709
+ value = &value[override_group_span_len];
710
+ last_group = true;
711
+ } else {
712
+ value = &value[override_group_span_len + 1];
713
+ override_group_span_len = std::strcspn(value, ",");
714
+ }
715
+ continue;
716
+ }
717
+ // Stamps null terminators into the argv
718
+ // value for this option to avoid the
719
+ // memory leak present in the implementation
720
+ // over in arg.cpp. Acceptable because we
721
+ // only parse these args once in this program.
722
+ auto * override_group = value;
723
+ if (value[override_group_span_len] == '\0') {
724
+ value = &value[override_group_span_len];
725
+ last_group = true;
554
726
  } else {
555
- tensor_split[i] = 0.0f;
727
+ value[override_group_span_len] = '\0';
728
+ value = &value[override_group_span_len + 1];
729
+ }
730
+ std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
731
+ auto override_span_len = std::strcspn(override_group, ";");
732
+ while (override_span_len > 0) {
733
+ auto * override = override_group;
734
+ if (override_group[override_span_len] != '\0') {
735
+ override_group[override_span_len] = '\0';
736
+ override_group = &override_group[override_span_len + 1];
737
+ } else {
738
+ override_group = &override_group[override_span_len];
739
+ }
740
+ auto tensor_name_span_len = std::strcspn(override, "=");
741
+ if (tensor_name_span_len >= override_span_len) {
742
+ invalid_param = true;
743
+ break;
744
+ }
745
+ override[tensor_name_span_len] = '\0';
746
+ auto * tensor_name = override;
747
+ auto * buffer_type = &override[tensor_name_span_len + 1];
748
+ if (buft_list.find(buffer_type) == buft_list.end()) {
749
+ printf("error: unrecognized buffer type '%s'\n", buffer_type);
750
+ printf("Available buffer types:\n");
751
+ for (const auto & it : buft_list) {
752
+ printf(" %s\n", ggml_backend_buft_name(it.second));
753
+ }
754
+ invalid_param = true;
755
+ break;
756
+ }
757
+ group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
758
+ override_span_len = std::strcspn(override_group, ";");
556
759
  }
760
+ if (invalid_param) {
761
+ break;
762
+ }
763
+ group_tensor_buft_overrides.push_back({nullptr,nullptr});
764
+ params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
765
+ override_group_span_len = std::strcspn(value, ",");
766
+ } while (!last_group);
767
+ } else if (arg == "-r" || arg == "--repetitions") {
768
+ if (++i >= argc) {
769
+ invalid_param = true;
770
+ break;
557
771
  }
558
- params.tensor_split.push_back(tensor_split);
559
- }
560
- } else if (arg == "-r" || arg == "--repetitions") {
561
- if (++i >= argc) {
562
- invalid_param = true;
563
- break;
564
- }
565
- params.reps = std::stoi(argv[i]);
566
- } else if (arg == "--prio") {
567
- if (++i >= argc) {
568
- invalid_param = true;
569
- break;
570
- }
571
- params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
572
- } else if (arg == "--delay") {
573
- if (++i >= argc) {
574
- invalid_param = true;
575
- break;
576
- }
577
- params.delay = std::stoi(argv[i]);
578
- } else if (arg == "-o" || arg == "--output") {
579
- if (++i >= argc) {
580
- invalid_param = true;
581
- break;
582
- }
583
- invalid_param = !output_format_from_str(argv[i], params.output_format);
584
- } else if (arg == "-oe" || arg == "--output-err") {
585
- if (++i >= argc) {
772
+ params.reps = std::stoi(argv[i]);
773
+ } else if (arg == "--prio") {
774
+ if (++i >= argc) {
775
+ invalid_param = true;
776
+ break;
777
+ }
778
+ params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
779
+ } else if (arg == "--delay") {
780
+ if (++i >= argc) {
781
+ invalid_param = true;
782
+ break;
783
+ }
784
+ params.delay = std::stoi(argv[i]);
785
+ } else if (arg == "-o" || arg == "--output") {
786
+ if (++i >= argc) {
787
+ invalid_param = true;
788
+ break;
789
+ }
790
+ invalid_param = !output_format_from_str(argv[i], params.output_format);
791
+ } else if (arg == "-oe" || arg == "--output-err") {
792
+ if (++i >= argc) {
793
+ invalid_param = true;
794
+ break;
795
+ }
796
+ invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
797
+ } else if (arg == "-v" || arg == "--verbose") {
798
+ params.verbose = true;
799
+ } else if (arg == "--progress") {
800
+ params.progress = true;
801
+ } else {
586
802
  invalid_param = true;
587
803
  break;
588
804
  }
589
- invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
590
- } else if (arg == "-v" || arg == "--verbose") {
591
- params.verbose = true;
592
- } else if (arg == "--progress") {
593
- params.progress = true;
594
- } else {
805
+ } catch (const std::exception & e) {
806
+ fprintf(stderr, "error: %s\n", e.what());
595
807
  invalid_param = true;
596
808
  break;
597
809
  }
598
810
  }
811
+
599
812
  if (invalid_param) {
600
813
  fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
601
814
  print_usage(argc, argv);
@@ -615,6 +828,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
615
828
  if (params.n_pg.empty()) {
616
829
  params.n_pg = cmd_params_defaults.n_pg;
617
830
  }
831
+ if (params.n_depth.empty()) {
832
+ params.n_depth = cmd_params_defaults.n_depth;
833
+ }
618
834
  if (params.n_batch.empty()) {
619
835
  params.n_batch = cmd_params_defaults.n_batch;
620
836
  }
@@ -627,6 +843,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
627
843
  if (params.type_v.empty()) {
628
844
  params.type_v = cmd_params_defaults.type_v;
629
845
  }
846
+ if (params.defrag_thold.empty()) {
847
+ params.defrag_thold = cmd_params_defaults.defrag_thold;
848
+ }
630
849
  if (params.n_gpu_layers.empty()) {
631
850
  params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
632
851
  }
@@ -648,12 +867,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
648
867
  if (params.tensor_split.empty()) {
649
868
  params.tensor_split = cmd_params_defaults.tensor_split;
650
869
  }
870
+ if (params.tensor_buft_overrides.empty()) {
871
+ params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
872
+ }
651
873
  if (params.use_mmap.empty()) {
652
874
  params.use_mmap = cmd_params_defaults.use_mmap;
653
875
  }
654
876
  if (params.embeddings.empty()) {
655
877
  params.embeddings = cmd_params_defaults.embeddings;
656
878
  }
879
+ if (params.no_op_offload.empty()) {
880
+ params.no_op_offload = cmd_params_defaults.no_op_offload;
881
+ }
657
882
  if (params.n_threads.empty()) {
658
883
  params.n_threads = cmd_params_defaults.n_threads;
659
884
  }
@@ -674,10 +899,12 @@ struct cmd_params_instance {
674
899
  std::string model;
675
900
  int n_prompt;
676
901
  int n_gen;
902
+ int n_depth;
677
903
  int n_batch;
678
904
  int n_ubatch;
679
905
  ggml_type type_k;
680
906
  ggml_type type_v;
907
+ float defrag_thold;
681
908
  int n_threads;
682
909
  std::string cpu_mask;
683
910
  bool cpu_strict;
@@ -689,8 +916,10 @@ struct cmd_params_instance {
689
916
  bool no_kv_offload;
690
917
  bool flash_attn;
691
918
  std::vector<float> tensor_split;
919
+ std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
692
920
  bool use_mmap;
693
921
  bool embeddings;
922
+ bool no_op_offload;
694
923
 
695
924
  llama_model_params to_llama_mparams() const {
696
925
  llama_model_params mparams = llama_model_default_params();
@@ -733,26 +962,35 @@ struct cmd_params_instance {
733
962
  mparams.tensor_split = tensor_split.data();
734
963
  mparams.use_mmap = use_mmap;
735
964
 
965
+ if (tensor_buft_overrides.empty()) {
966
+ mparams.tensor_buft_overrides = nullptr;
967
+ } else {
968
+ GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
969
+ mparams.tensor_buft_overrides = tensor_buft_overrides.data();
970
+ }
971
+
736
972
  return mparams;
737
973
  }
738
974
 
739
975
  bool equal_mparams(const cmd_params_instance & other) const {
740
976
  return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
741
977
  split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
742
- tensor_split == other.tensor_split;
978
+ tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
743
979
  }
744
980
 
745
981
  llama_context_params to_llama_cparams() const {
746
982
  llama_context_params cparams = llama_context_default_params();
747
983
 
748
- cparams.n_ctx = n_prompt + n_gen;
749
- cparams.n_batch = n_batch;
750
- cparams.n_ubatch = n_ubatch;
751
- cparams.type_k = type_k;
752
- cparams.type_v = type_v;
753
- cparams.offload_kqv = !no_kv_offload;
754
- cparams.flash_attn = flash_attn;
755
- cparams.embeddings = embeddings;
984
+ cparams.n_ctx = n_prompt + n_gen + n_depth;
985
+ cparams.n_batch = n_batch;
986
+ cparams.n_ubatch = n_ubatch;
987
+ cparams.type_k = type_k;
988
+ cparams.type_v = type_v;
989
+ cparams.defrag_thold = defrag_thold;
990
+ cparams.offload_kqv = !no_kv_offload;
991
+ cparams.flash_attn = flash_attn;
992
+ cparams.embeddings = embeddings;
993
+ cparams.op_offload = !no_op_offload;
756
994
 
757
995
  return cparams;
758
996
  }
@@ -769,17 +1007,21 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
769
1007
  for (const auto & sm : params.split_mode)
770
1008
  for (const auto & mg : params.main_gpu)
771
1009
  for (const auto & ts : params.tensor_split)
1010
+ for (const auto & ot : params.tensor_buft_overrides)
772
1011
  for (const auto & mmp : params.use_mmap)
773
1012
  for (const auto & embd : params.embeddings)
1013
+ for (const auto & nopo : params.no_op_offload)
774
1014
  for (const auto & nb : params.n_batch)
775
1015
  for (const auto & nub : params.n_ubatch)
776
1016
  for (const auto & tk : params.type_k)
777
1017
  for (const auto & tv : params.type_v)
1018
+ for (const auto & defrag_thold : params.defrag_thold)
778
1019
  for (const auto & nkvo : params.no_kv_offload)
779
1020
  for (const auto & fa : params.flash_attn)
780
1021
  for (const auto & nt : params.n_threads)
781
1022
  for (const auto & cm : params.cpu_mask)
782
1023
  for (const auto & cs : params.cpu_strict)
1024
+ for (const auto & nd : params.n_depth)
783
1025
  for (const auto & pl : params.poll) {
784
1026
  for (const auto & n_prompt : params.n_prompt) {
785
1027
  if (n_prompt == 0) {
@@ -789,10 +1031,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
789
1031
  /* .model = */ m,
790
1032
  /* .n_prompt = */ n_prompt,
791
1033
  /* .n_gen = */ 0,
1034
+ /* .n_depth = */ nd,
792
1035
  /* .n_batch = */ nb,
793
1036
  /* .n_ubatch = */ nub,
794
1037
  /* .type_k = */ tk,
795
1038
  /* .type_v = */ tv,
1039
+ /* .defrag_thold = */ defrag_thold,
796
1040
  /* .n_threads = */ nt,
797
1041
  /* .cpu_mask = */ cm,
798
1042
  /* .cpu_strict = */ cs,
@@ -804,8 +1048,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
804
1048
  /* .no_kv_offload= */ nkvo,
805
1049
  /* .flash_attn = */ fa,
806
1050
  /* .tensor_split = */ ts,
1051
+ /* .tensor_buft_overrides = */ ot,
807
1052
  /* .use_mmap = */ mmp,
808
1053
  /* .embeddings = */ embd,
1054
+ /* .no_op_offload= */ nopo,
809
1055
  };
810
1056
  instances.push_back(instance);
811
1057
  }
@@ -818,10 +1064,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
818
1064
  /* .model = */ m,
819
1065
  /* .n_prompt = */ 0,
820
1066
  /* .n_gen = */ n_gen,
1067
+ /* .n_depth = */ nd,
821
1068
  /* .n_batch = */ nb,
822
1069
  /* .n_ubatch = */ nub,
823
1070
  /* .type_k = */ tk,
824
1071
  /* .type_v = */ tv,
1072
+ /* .defrag_thold = */ defrag_thold,
825
1073
  /* .n_threads = */ nt,
826
1074
  /* .cpu_mask = */ cm,
827
1075
  /* .cpu_strict = */ cs,
@@ -833,8 +1081,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
833
1081
  /* .no_kv_offload= */ nkvo,
834
1082
  /* .flash_attn = */ fa,
835
1083
  /* .tensor_split = */ ts,
1084
+ /* .tensor_buft_overrides = */ ot,
836
1085
  /* .use_mmap = */ mmp,
837
1086
  /* .embeddings = */ embd,
1087
+ /* .no_op_offload= */ nopo,
838
1088
  };
839
1089
  instances.push_back(instance);
840
1090
  }
@@ -847,10 +1097,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
847
1097
  /* .model = */ m,
848
1098
  /* .n_prompt = */ n_pg.first,
849
1099
  /* .n_gen = */ n_pg.second,
1100
+ /* .n_depth = */ nd,
850
1101
  /* .n_batch = */ nb,
851
1102
  /* .n_ubatch = */ nub,
852
1103
  /* .type_k = */ tk,
853
1104
  /* .type_v = */ tv,
1105
+ /* .defrag_thold = */ defrag_thold,
854
1106
  /* .n_threads = */ nt,
855
1107
  /* .cpu_mask = */ cm,
856
1108
  /* .cpu_strict = */ cs,
@@ -862,8 +1114,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
862
1114
  /* .no_kv_offload= */ nkvo,
863
1115
  /* .flash_attn = */ fa,
864
1116
  /* .tensor_split = */ ts,
1117
+ /* .tensor_buft_overrides = */ ot,
865
1118
  /* .use_mmap = */ mmp,
866
1119
  /* .embeddings = */ embd,
1120
+ /* .no_op_offload= */ nopo,
867
1121
  };
868
1122
  instances.push_back(instance);
869
1123
  }
@@ -890,16 +1144,20 @@ struct test {
890
1144
  int poll;
891
1145
  ggml_type type_k;
892
1146
  ggml_type type_v;
1147
+ float defrag_thold;
893
1148
  int n_gpu_layers;
894
1149
  llama_split_mode split_mode;
895
1150
  int main_gpu;
896
1151
  bool no_kv_offload;
897
1152
  bool flash_attn;
898
1153
  std::vector<float> tensor_split;
1154
+ std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
899
1155
  bool use_mmap;
900
1156
  bool embeddings;
1157
+ bool no_op_offload;
901
1158
  int n_prompt;
902
1159
  int n_gen;
1160
+ int n_depth;
903
1161
  std::string test_time;
904
1162
  std::vector<uint64_t> samples_ns;
905
1163
 
@@ -921,16 +1179,20 @@ struct test {
921
1179
  poll = inst.poll;
922
1180
  type_k = inst.type_k;
923
1181
  type_v = inst.type_v;
1182
+ defrag_thold = inst.defrag_thold;
924
1183
  n_gpu_layers = inst.n_gpu_layers;
925
1184
  split_mode = inst.split_mode;
926
1185
  main_gpu = inst.main_gpu;
927
1186
  no_kv_offload = inst.no_kv_offload;
928
1187
  flash_attn = inst.flash_attn;
929
1188
  tensor_split = inst.tensor_split;
1189
+ tensor_buft_overrides = inst.tensor_buft_overrides;
930
1190
  use_mmap = inst.use_mmap;
931
1191
  embeddings = inst.embeddings;
1192
+ no_op_offload = inst.no_op_offload;
932
1193
  n_prompt = inst.n_prompt;
933
1194
  n_gen = inst.n_gen;
1195
+ n_depth = inst.n_depth;
934
1196
  // RFC 3339 date-time format
935
1197
  time_t t = time(NULL);
936
1198
  std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
@@ -972,9 +1234,10 @@ struct test {
972
1234
  "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
973
1235
  "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
974
1236
  "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
975
- "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
976
- "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
977
- "avg_ts", "stddev_ts",
1237
+ "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
1238
+ "defrag_thold",
1239
+ "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
1240
+ "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
978
1241
  };
979
1242
  return fields;
980
1243
  }
@@ -984,15 +1247,15 @@ struct test {
984
1247
  static field_type get_field_type(const std::string & field) {
985
1248
  if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
986
1249
  field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
987
- field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
988
- field == "stddev_ns") {
1250
+ field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
1251
+ field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
989
1252
  return INT;
990
1253
  }
991
1254
  if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
992
1255
  field == "use_mmap" || field == "embeddings") {
993
1256
  return BOOL;
994
1257
  }
995
- if (field == "avg_ts" || field == "stddev_ts") {
1258
+ if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
996
1259
  return FLOAT;
997
1260
  }
998
1261
  return STRING;
@@ -1000,6 +1263,7 @@ struct test {
1000
1263
 
1001
1264
  std::vector<std::string> get_values() const {
1002
1265
  std::string tensor_split_str;
1266
+ std::string tensor_buft_overrides_str;
1003
1267
  int max_nonzero = 0;
1004
1268
  for (size_t i = 0; i < llama_max_devices(); i++) {
1005
1269
  if (tensor_split[i] > 0) {
@@ -1014,6 +1278,26 @@ struct test {
1014
1278
  tensor_split_str += "/";
1015
1279
  }
1016
1280
  }
1281
+ if (tensor_buft_overrides.size() == 1) {
1282
+ // Last element of tensor_buft_overrides is always a null pattern
1283
+ // so if it is only one element long, it must be a null pattern.
1284
+ GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
1285
+ tensor_buft_overrides_str += "none";
1286
+ } else {
1287
+ for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) {
1288
+ // Last element of tensor_buft_overrides is always a null pattern
1289
+ if (tensor_buft_overrides[i].pattern == nullptr) {
1290
+ tensor_buft_overrides_str += "none";
1291
+ } else {
1292
+ tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
1293
+ tensor_buft_overrides_str += "=";
1294
+ tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
1295
+ }
1296
+ if (i + 2 < tensor_buft_overrides.size()) {
1297
+ tensor_buft_overrides_str += ";";
1298
+ }
1299
+ }
1300
+ }
1017
1301
  std::vector<std::string> values = { build_commit,
1018
1302
  std::to_string(build_number),
1019
1303
  cpu_info,
@@ -1037,10 +1321,14 @@ struct test {
1037
1321
  std::to_string(no_kv_offload),
1038
1322
  std::to_string(flash_attn),
1039
1323
  tensor_split_str,
1324
+ tensor_buft_overrides_str,
1325
+ std::to_string(defrag_thold),
1040
1326
  std::to_string(use_mmap),
1041
1327
  std::to_string(embeddings),
1328
+ std::to_string(no_op_offload),
1042
1329
  std::to_string(n_prompt),
1043
1330
  std::to_string(n_gen),
1331
+ std::to_string(n_depth),
1044
1332
  test_time,
1045
1333
  std::to_string(avg_ns()),
1046
1334
  std::to_string(stdev_ns()),
@@ -1218,7 +1506,10 @@ struct markdown_printer : public printer {
1218
1506
  return 4;
1219
1507
  }
1220
1508
  if (field == "test") {
1221
- return 13;
1509
+ return 15;
1510
+ }
1511
+ if (field == "no_op_offload") {
1512
+ return 4;
1222
1513
  }
1223
1514
 
1224
1515
  int width = std::max((int) field.length(), 10);
@@ -1251,9 +1542,15 @@ struct markdown_printer : public printer {
1251
1542
  if (field == "embeddings") {
1252
1543
  return "embd";
1253
1544
  }
1545
+ if (field == "no_op_offload") {
1546
+ return "nopo";
1547
+ }
1254
1548
  if (field == "tensor_split") {
1255
1549
  return "ts";
1256
1550
  }
1551
+ if (field == "tensor_buft_overrides") {
1552
+ return "ot";
1553
+ }
1257
1554
  return field;
1258
1555
  }
1259
1556
 
@@ -1292,6 +1589,9 @@ struct markdown_printer : public printer {
1292
1589
  if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
1293
1590
  fields.emplace_back("type_v");
1294
1591
  }
1592
+ if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
1593
+ fields.emplace_back("defrag_thold");
1594
+ }
1295
1595
  if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
1296
1596
  fields.emplace_back("main_gpu");
1297
1597
  }
@@ -1307,12 +1607,18 @@ struct markdown_printer : public printer {
1307
1607
  if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
1308
1608
  fields.emplace_back("tensor_split");
1309
1609
  }
1610
+ if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
1611
+ fields.emplace_back("tensor_buft_overrides");
1612
+ }
1310
1613
  if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
1311
1614
  fields.emplace_back("use_mmap");
1312
1615
  }
1313
1616
  if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
1314
1617
  fields.emplace_back("embeddings");
1315
1618
  }
1619
+ if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
1620
+ fields.emplace_back("no_op_offload");
1621
+ }
1316
1622
  fields.emplace_back("test");
1317
1623
  fields.emplace_back("t/s");
1318
1624
 
@@ -1362,6 +1668,10 @@ struct markdown_printer : public printer {
1362
1668
  } else {
1363
1669
  snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
1364
1670
  }
1671
+ if (t.n_depth > 0) {
1672
+ int len = strlen(buf);
1673
+ snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
1674
+ }
1365
1675
  value = buf;
1366
1676
  } else if (field == "t/s") {
1367
1677
  snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
@@ -1427,7 +1737,7 @@ struct sql_printer : public printer {
1427
1737
  }
1428
1738
  };
1429
1739
 
1430
- static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
1740
+ static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
1431
1741
  llama_set_n_threads(ctx, n_threads, n_threads);
1432
1742
 
1433
1743
  const llama_model * model = llama_get_model(ctx);
@@ -1444,14 +1754,19 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
1444
1754
  for (int i = 1; i < n_tokens; i++) {
1445
1755
  tokens[i] = std::rand() % n_vocab;
1446
1756
  }
1447
- llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
1757
+ int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
1758
+ if (res != 0) {
1759
+ fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res);
1760
+ return false;
1761
+ }
1448
1762
  n_processed += n_tokens;
1449
1763
  }
1450
1764
 
1451
1765
  llama_synchronize(ctx);
1766
+ return true;
1452
1767
  }
1453
1768
 
1454
- static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
1769
+ static bool test_gen(llama_context * ctx, int n_gen, int n_threads) {
1455
1770
  llama_set_n_threads(ctx, n_threads, n_threads);
1456
1771
 
1457
1772
  const llama_model * model = llama_get_model(ctx);
@@ -1461,10 +1776,15 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
1461
1776
  llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
1462
1777
 
1463
1778
  for (int i = 0; i < n_gen; i++) {
1464
- llama_decode(ctx, llama_batch_get_one(&token, 1));
1779
+ int res = llama_decode(ctx, llama_batch_get_one(&token, 1));
1780
+ if (res != 0) {
1781
+ fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res);
1782
+ return false;
1783
+ }
1465
1784
  llama_synchronize(ctx);
1466
1785
  token = std::rand() % n_vocab;
1467
1786
  }
1787
+ return true;
1468
1788
  }
1469
1789
 
1470
1790
  static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
@@ -1507,10 +1827,11 @@ int main(int argc, char ** argv) {
1507
1827
  fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
1508
1828
  #endif
1509
1829
 
1510
- cmd_params params = parse_cmd_params(argc, argv);
1511
-
1512
1830
  // initialize backends
1513
1831
  ggml_backend_load_all();
1832
+
1833
+ cmd_params params = parse_cmd_params(argc, argv);
1834
+
1514
1835
  auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
1515
1836
  if (!cpu_dev) {
1516
1837
  fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
@@ -1608,18 +1929,38 @@ int main(int argc, char ** argv) {
1608
1929
  fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
1609
1930
  }
1610
1931
  //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
1611
- test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
1932
+ bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
1933
+ if (!res) {
1934
+ fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
1935
+ exit(1);
1936
+ }
1612
1937
  }
1613
1938
  if (t.n_gen > 0) {
1614
1939
  if (params.progress) {
1615
1940
  fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
1616
1941
  }
1617
- test_gen(ctx, 1, t.n_threads);
1942
+ bool res = test_gen(ctx, 1, t.n_threads);
1943
+ if (!res) {
1944
+ fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
1945
+ exit(1);
1946
+ }
1618
1947
  }
1619
1948
 
1620
1949
  for (int i = 0; i < params.reps; i++) {
1621
1950
  llama_kv_self_clear(ctx);
1622
1951
 
1952
+ if (t.n_depth > 0) {
1953
+ if (params.progress) {
1954
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
1955
+ i + 1, params.reps);
1956
+ }
1957
+ bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
1958
+ if (!res) {
1959
+ fprintf(stderr, "%s: error: failed to run depth\n", __func__);
1960
+ exit(1);
1961
+ }
1962
+ }
1963
+
1623
1964
  uint64_t t_start = get_time_ns();
1624
1965
 
1625
1966
  if (t.n_prompt > 0) {
@@ -1627,14 +1968,22 @@ int main(int argc, char ** argv) {
1627
1968
  fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
1628
1969
  i + 1, params.reps);
1629
1970
  }
1630
- test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
1971
+ bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
1972
+ if (!res) {
1973
+ fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
1974
+ exit(1);
1975
+ }
1631
1976
  }
1632
1977
  if (t.n_gen > 0) {
1633
1978
  if (params.progress) {
1634
1979
  fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
1635
1980
  i + 1, params.reps);
1636
1981
  }
1637
- test_gen(ctx, t.n_gen, t.n_threads);
1982
+ bool res = test_gen(ctx, t.n_gen, t.n_threads);
1983
+ if (!res) {
1984
+ fprintf(stderr, "%s: error: failed to run gen\n", __func__);
1985
+ exit(1);
1986
+ }
1638
1987
  }
1639
1988
 
1640
1989
  uint64_t t_ns = get_time_ns() - t_start;