@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295

package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -36,6 +36,46 @@ static uint64_t get_time_ns() {
     return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
 }
 
+static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
+    if (a.pattern != b.pattern) {
+        // cString comparison that may be null
+        if (a.pattern == nullptr || b.pattern == nullptr) {
+            return false;
+        }
+        if (strcmp(a.pattern, b.pattern) != 0) {
+            return false;
+        }
+    }
+    if (a.buft != b.buft) {
+        return false;
+    }
+    return true;
+}
+
+static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
+    if (a.size() != b.size()) {
+        return false;
+    }
+    for (size_t i = 0; i < a.size(); i++) {
+        if (!tensor_buft_override_equal(a[i], b[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
+    if (a.size() != b.size()) {
+        return false;
+    }
+    for (size_t i = 0; i < a.size(); i++) {
+        if (!vec_tensor_buft_override_equal(a[i], b[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
 template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
     std::ostringstream str;
     for (size_t i = 0; i < values.size(); i++) {
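
Note: these three helpers exist because llama_model_tensor_buft_override stores its pattern as a raw C string, so element-wise comparison needs null checks plus strcmp rather than pointer equality. They also rely on the convention, used throughout this patch, that an override list ends with a {nullptr, nullptr} sentinel. A minimal illustration of that convention (not part of the diff; the device lookup mirrors the enumeration code in the -ot parser below):

    // Build a sentinel-terminated override list: tensor-name pattern, then
    // target buffer type. Two logically equal lists may point at different
    // argv storage, hence strcmp above instead of pointer comparison.
    std::vector<llama_model_tensor_buft_override> overrides {
        { "blk\\.0\\.ffn_up", ggml_backend_dev_buffer_type(ggml_backend_dev_get(0)) },
        { nullptr, nullptr },  // terminator that to_llama_mparams() asserts on
    };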

@@ -160,6 +200,7 @@ struct cmd_params {
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
     std::vector<std::pair<int, int>> n_pg;
+    std::vector<int> n_depth;
     std::vector<int> n_batch;
     std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
@@ -175,6 +216,7 @@ struct cmd_params {
     std::vector<bool> no_kv_offload;
     std::vector<bool> flash_attn;
     std::vector<std::vector<float>> tensor_split;
+    std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
@@ -192,6 +234,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_prompt             */ { 512 },
     /* n_gen                */ { 128 },
     /* n_pg                 */ {},
+    /* n_depth              */ { 0 },
     /* n_batch              */ { 2048 },
     /* n_ubatch             */ { 512 },
     /* type_k               */ { GGML_TYPE_F16 },
@@ -207,6 +250,7 @@ static const cmd_params cmd_params_defaults = {
     /* no_kv_offload        */ { false },
     /* flash_attn           */ { false },
     /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
+    /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
     /* use_mmap             */ { true },
     /* embeddings           */ { false },
     /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
@@ -230,6 +274,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     printf("  -pg <pp,tg>                               (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
+    printf("  -d, --n-depth <n>                         (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
     printf("  -b, --batch-size <n>                      (default: %s)\n",
            join(cmd_params_defaults.n_batch, ",").c_str());
     printf("  -ub, --ubatch-size <n>                    (default: %s)\n",
@@ -265,6 +310,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
            join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
+    printf("  -ot --override-tensors <tensor name pattern>=<buffer type>;...  (default: disabled)\n");
     printf("  -r, --repetitions <n>                     (default: %d)\n", cmd_params_defaults.reps);
     printf("  --prio <0|1|2|3>                          (default: %d)\n", cmd_params_defaults.prio);
     printf("  --delay <0...N> (seconds)                 (default: %d)\n", cmd_params_defaults.delay);
@@ -366,6 +412,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
+        } else if (arg == "-d" || arg == "--n-depth") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
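
Note: like llama-bench's other numeric flags, -d / --n-depth accepts a comma-separated list (for example -d 0,512,4096), and each value expands into its own benchmark instance through the combinatorial loops in get_cmd_params_instances() further down.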

@@ -557,6 +610,87 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 params.tensor_split.push_back(tensor_split);
             }
+        } else if (arg == "-ot" || arg == "--override-tensor") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto value = argv[i];
+            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+            if (buft_list.empty()) {
+                // enumerate all the devices and add their buffer types to the list
+                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                    auto * dev = ggml_backend_dev_get(i);
+                    auto * buft = ggml_backend_dev_buffer_type(dev);
+                    if (buft) {
+                        buft_list[ggml_backend_buft_name(buft)] = buft;
+                    }
+                }
+            }
+            auto override_group_span_len = std::strcspn(value, ",");
+            bool last_group = false;
+            do {
+                if (override_group_span_len == 0) {
+                    // Adds an empty override-tensors for an empty span
+                    params.tensor_buft_overrides.push_back({{}});
+                    if (value[override_group_span_len] == '\0') {
+                        value = &value[override_group_span_len];
+                        last_group = true;
+                    } else {
+                        value = &value[override_group_span_len + 1];
+                        override_group_span_len = std::strcspn(value, ",");
+                    }
+                    continue;
+                }
+                // Stamps null terminators into the argv
+                // value for this option to avoid the
+                // memory leak present in the implementation
+                // over in arg.cpp. Acceptable because we
+                // only parse these args once in this program.
+                auto override_group = value;
+                if (value[override_group_span_len] == '\0') {
+                    value = &value[override_group_span_len];
+                    last_group = true;
+                } else {
+                    value[override_group_span_len] = '\0';
+                    value = &value[override_group_span_len + 1];
+                }
+                std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
+                auto override_span_len = std::strcspn(override_group, ";");
+                while (override_span_len > 0) {
+                    auto override = override_group;
+                    if (override_group[override_span_len] != '\0') {
+                        override_group[override_span_len] = '\0';
+                        override_group = &override_group[override_span_len + 1];
+                    } else {
+                        override_group = &override_group[override_span_len];
+                    }
+                    auto tensor_name_span_len = std::strcspn(override, "=");
+                    if (tensor_name_span_len >= override_span_len) {
+                        invalid_param = true;
+                        break;
+                    }
+                    override[tensor_name_span_len] = '\0';
+                    auto tensor_name = override;
+                    auto buffer_type = &override[tensor_name_span_len + 1];
+                    if (buft_list.find(buffer_type) == buft_list.end()) {
+                        printf("Available buffer types:\n");
+                        for (const auto & it : buft_list) {
+                            printf("  %s\n", ggml_backend_buft_name(it.second));
+                        }
+                        invalid_param = true;
+                        break;
+                    }
+                    group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
+                    override_span_len = std::strcspn(override_group, ";");
+                }
+                if (invalid_param) {
+                    break;
+                }
+                group_tensor_buft_overrides.push_back({nullptr,nullptr});
+                params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
+                override_group_span_len = std::strcspn(value, ",");
+            } while (!last_group);
         } else if (arg == "-r" || arg == "--repetitions") {
             if (++i >= argc) {
                 invalid_param = true;
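
Note on the grammar this parser accepts: the -ot value is split on commas into override groups, each group becoming one benchmark configuration, and every group is then split on semicolons into <tensor name pattern>=<buffer type> pairs; an empty group stands for "no overrides". So an argument such as "blk\..*=CPU," would presumably produce two runs per test, one with all block tensors pinned to the CPU buffer type and one without overrides. The in-place null-termination of argv is a deliberate shortcut, as the comment above notes, acceptable only because the argument is parsed once.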

@@ -615,6 +749,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_pg.empty()) {
         params.n_pg = cmd_params_defaults.n_pg;
     }
+    if (params.n_depth.empty()) {
+        params.n_depth = cmd_params_defaults.n_depth;
+    }
     if (params.n_batch.empty()) {
         params.n_batch = cmd_params_defaults.n_batch;
     }
@@ -648,6 +785,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.tensor_split.empty()) {
         params.tensor_split = cmd_params_defaults.tensor_split;
     }
+    if (params.tensor_buft_overrides.empty()) {
+        params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
+    }
     if (params.use_mmap.empty()) {
         params.use_mmap = cmd_params_defaults.use_mmap;
     }
@@ -674,6 +814,7 @@ struct cmd_params_instance {
     std::string model;
     int n_prompt;
     int n_gen;
+    int n_depth;
     int n_batch;
     int n_ubatch;
     ggml_type type_k;
@@ -689,6 +830,7 @@ struct cmd_params_instance {
     bool no_kv_offload;
     bool flash_attn;
     std::vector<float> tensor_split;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
     bool embeddings;
 
@@ -733,19 +875,26 @@ struct cmd_params_instance {
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap = use_mmap;
 
+        if (tensor_buft_overrides.empty()) {
+            mparams.tensor_buft_overrides = nullptr;
+        } else {
+            GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+            mparams.tensor_buft_overrides = tensor_buft_overrides.data();
+        }
+
         return mparams;
     }
 
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
                split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
-               tensor_split == other.tensor_split;
+               tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
 
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
 
-        cparams.n_ctx = n_prompt + n_gen;
+        cparams.n_ctx = n_prompt + n_gen + n_depth;
         cparams.n_batch = n_batch;
         cparams.n_ubatch = n_ubatch;
         cparams.type_k = type_k;
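
Note: two details in this hunk carry the rest of the feature. to_llama_mparams() forwards the overrides as a raw pointer, asserting the null-pattern terminator, and equal_mparams() now compares override lists so a loaded model is reused only across instances whose overrides match. A condensed sketch of the hand-off, assuming ot holds a parsed, sentinel-terminated list as llama-bench guarantees:

    std::vector<llama_model_tensor_buft_override> ot;  // filled by the -ot parser above
    llama_model_params mp = llama_model_default_params();
    mp.tensor_buft_overrides = ot.empty() ? nullptr : ot.data();  // read until the null pattern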

@@ -769,6 +918,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
+    for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
     for (const auto & embd : params.embeddings)
     for (const auto & nb : params.n_batch)
@@ -780,6 +930,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & nt : params.n_threads)
     for (const auto & cm : params.cpu_mask)
     for (const auto & cs : params.cpu_strict)
+    for (const auto & nd : params.n_depth)
     for (const auto & pl : params.poll) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
@@ -789,6 +940,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .model        = */ m,
                 /* .n_prompt     = */ n_prompt,
                 /* .n_gen        = */ 0,
+                /* .n_depth      = */ nd,
                 /* .n_batch      = */ nb,
                 /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
@@ -804,6 +956,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
+                /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
                 /* .embeddings   = */ embd,
             };
@@ -818,6 +971,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .model        = */ m,
                 /* .n_prompt     = */ 0,
                 /* .n_gen        = */ n_gen,
+                /* .n_depth      = */ nd,
                 /* .n_batch      = */ nb,
                 /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
@@ -833,6 +987,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
+                /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
                 /* .embeddings   = */ embd,
             };
@@ -847,6 +1002,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .model        = */ m,
                 /* .n_prompt     = */ n_pg.first,
                 /* .n_gen        = */ n_pg.second,
+                /* .n_depth      = */ nd,
                 /* .n_batch      = */ nb,
                 /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
@@ -862,6 +1018,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
+                /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
                 /* .embeddings   = */ embd,
             };
@@ -896,10 +1053,12 @@ struct test {
     bool no_kv_offload;
     bool flash_attn;
     std::vector<float> tensor_split;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
     bool embeddings;
     int n_prompt;
     int n_gen;
+    int n_depth;
     std::string test_time;
     std::vector<uint64_t> samples_ns;
 
@@ -927,10 +1086,12 @@ struct test {
         no_kv_offload = inst.no_kv_offload;
         flash_attn = inst.flash_attn;
         tensor_split = inst.tensor_split;
+        tensor_buft_overrides = inst.tensor_buft_overrides;
         use_mmap = inst.use_mmap;
         embeddings = inst.embeddings;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
+        n_depth = inst.n_depth;
         // RFC 3339 date-time format
         time_t t = time(NULL);
         std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
@@ -972,9 +1133,9 @@ struct test {
             "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
             "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
             "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
-            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "use_mmap",
-            "embeddings",   "n_prompt",     "n_gen",          "test_time",  "avg_ns",       "stddev_ns",
-            "avg_ts",       "stddev_ts",
+            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
+            "use_mmap",     "embeddings",   "n_prompt",       "n_gen",      "n_depth",      "test_time",
+            "avg_ns",       "stddev_ns",    "avg_ts",         "stddev_ts",
         };
         return fields;
     }
@@ -984,8 +1145,8 @@ struct test {
     static field_type get_field_type(const std::string & field) {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
             field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
-            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
-            field == "stddev_ns") {
+            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
+            field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1000,6 +1161,7 @@ struct test {
 
     std::vector<std::string> get_values() const {
         std::string tensor_split_str;
+        std::string tensor_buft_overrides_str;
         int max_nonzero = 0;
         for (size_t i = 0; i < llama_max_devices(); i++) {
             if (tensor_split[i] > 0) {
@@ -1014,6 +1176,26 @@ struct test {
                 tensor_split_str += "/";
             }
         }
+        if (tensor_buft_overrides.size() == 1) {
+            // Last element of tensor_buft_overrides is always a null pattern
+            // so if it is only one element long, it must be a null pattern.
+            GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
+            tensor_buft_overrides_str += "none";
+        } else {
+            for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) {
+                // Last element of tensor_buft_overrides is always a null pattern
+                if (tensor_buft_overrides[i].pattern == nullptr) {
+                    tensor_buft_overrides_str += "none";
+                } else {
+                    tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
+                    tensor_buft_overrides_str += "=";
+                    tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
+                }
+                if (i + 2 < tensor_buft_overrides.size()) {
+                    tensor_buft_overrides_str += ";";
+                }
+            }
+        }
         std::vector<std::string> values = { build_commit,
                                             std::to_string(build_number),
                                             cpu_info,
@@ -1037,10 +1219,12 @@ struct test {
                                             std::to_string(no_kv_offload),
                                             std::to_string(flash_attn),
                                             tensor_split_str,
+                                            tensor_buft_overrides_str,
                                             std::to_string(use_mmap),
                                             std::to_string(embeddings),
                                             std::to_string(n_prompt),
                                             std::to_string(n_gen),
+                                            std::to_string(n_depth),
                                             test_time,
                                             std::to_string(avg_ns()),
                                             std::to_string(stdev_ns()),
@@ -1218,7 +1402,7 @@ struct markdown_printer : public printer {
             return 4;
         }
         if (field == "test") {
-            return 13;
+            return 15;
         }
 
         int width = std::max((int) field.length(), 10);
@@ -1254,6 +1438,9 @@ struct markdown_printer : public printer {
         if (field == "tensor_split") {
             return "ts";
         }
+        if (field == "tensor_buft_overrides") {
+            return "ot";
+        }
         return field;
     }
 
@@ -1307,6 +1494,9 @@ struct markdown_printer : public printer {
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.emplace_back("tensor_split");
         }
+        if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
+            fields.emplace_back("tensor_buft_overrides");
+        }
         if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
             fields.emplace_back("use_mmap");
         }
@@ -1362,6 +1552,10 @@ struct markdown_printer : public printer {
             } else {
                 snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
             }
+            if (t.n_depth > 0) {
+                int len = strlen(buf);
+                snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
+            }
             value = buf;
         } else if (field == "t/s") {
             snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
@@ -1620,6 +1814,14 @@ int main(int argc, char ** argv) {
         for (int i = 0; i < params.reps; i++) {
             llama_kv_self_clear(ctx);
 
+            if (t.n_depth > 0) {
+                if (params.progress) {
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
+                            i + 1, params.reps);
+                }
+                test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
+            }
+
             uint64_t t_start = get_time_ns();
 
             if (t.n_prompt > 0) {
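
Note: this is the heart of -d / --n-depth. Before each timed repetition the KV cache is cleared and then pre-filled with n_depth tokens via test_prompt(), so the timed prompt-processing and generation work is measured at that context depth rather than from an empty cache; the widened "test" column renders this as, for example, pp512 @ d1024. Under this reading, llama-bench -p 512 -n 128 -d 0,1024 reports each test both at depth 0 and after 1024 tokens of prefill.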

package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts

@@ -18,6 +18,7 @@ android {
     }
     externalNativeBuild {
         cmake {
+            arguments += "-DLLAMA_CURL=OFF"
            arguments += "-DLLAMA_BUILD_COMMON=ON"
            arguments += "-DGGML_LLAMAFILE=OFF"
            arguments += "-DCMAKE_BUILD_TYPE=Release"

package/src/llama.cpp/examples/llava/CMakeLists.txt

@@ -1,3 +1,5 @@
+# llava (legacy)
+
 add_library(llava OBJECT
             llava.cpp
             llava.h
@@ -22,40 +24,53 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS llava_shared LIBRARY)
 endif()
 
+# mtmd
+
+add_library(mtmd OBJECT
+            mtmd.cpp
+            mtmd.h
+            clip.cpp
+            clip.h
+            clip-impl.h
+            )
+
+target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+
+target_include_directories(mtmd PUBLIC .)
+target_include_directories(mtmd PRIVATE ../..)
+target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
+
+target_compile_features(mtmd PRIVATE cxx_std_17)
+
+add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
+    target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS mtmd_shared LIBRARY)
+endif()
+
 if (NOT MSVC)
     target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
+    target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
 
 if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
+    add_dependencies(mtmd BUILD_INFO)
 endif()
 
-set(TARGET llama-llava-cli)
-add_executable(${TARGET} llava-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-set(TARGET llama-minicpmv-cli)
-add_executable(${TARGET} minicpmv-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-set(TARGET llama-qwen2vl-cli)
-add_executable(${TARGET} qwen2vl-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_executable(llama-llava-cli deprecation-warning.cpp)
+add_executable(llama-gemma3-cli deprecation-warning.cpp)
+add_executable(llama-minicpmv-cli deprecation-warning.cpp)
+add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
 
-set(TARGET llama-gemma3-cli)
-add_executable(${TARGET} gemma3-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
+set(TARGET llama-mtmd-cli)
+add_executable(${TARGET} mtmd-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-llava-clip-quantize-cli)
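
Note: these CMake changes formalize the multimodal consolidation visible in the file list: clip.cpp plus the new mtmd.cpp/mtmd.h become a reusable mtmd library, the single llama-mtmd-cli (mtmd-cli.cpp) replaces the per-model tools, and the old binary names are kept as stubs built from deprecation-warning.cpp so existing scripts fail loudly rather than silently. The stub's 22-line body is not shown in this diff; a purely hypothetical sketch of what such a stub might look like:

    // Hypothetical stand-in for deprecation-warning.cpp; the real file
    // (+22 lines) is not included in this diff excerpt.
    #include <cstdio>
    int main(int /*argc*/, char ** argv) {
        fprintf(stderr, "%s has been replaced by llama-mtmd-cli\n", argv[0]);
        return 1;
    }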