@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/tokenize/tokenize.cpp

@@ -1,11 +1,13 @@
 #include "common.h"
+//#include "log.h" // TODO: start using log.h
 #include "llama.h"

-#include <cmath>
 #include <cstdio>
+#include <cstring>
 #include <fstream>
 #include <string>
 #include <vector>
+#include <iostream> // TODO: remove me

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -13,25 +15,25 @@
 #include <shellapi.h> // For CommandLineToArgvW
 #endif

-static void print_usage_information(const char * argv0, FILE * stream) {
-    fprintf(stream, "usage: %s [options]\n\n", argv0);
-    fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
-    fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
-    fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
-    fprintf(stream, "to control the behavior of the tokenizer.\n\n");
-    fprintf(stream, "  The possible options are:\n");
-    fprintf(stream, "\n");
-    fprintf(stream, "  -h, --help                           print this help and exit\n");
-    fprintf(stream, "  -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
-    fprintf(stream, "  --ids                                if given, only print numerical token IDs, and not token strings.\n");
-    fprintf(stream, "                                       The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
-    fprintf(stream, "  -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
-    fprintf(stream, "  -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
-    fprintf(stream, "  --stdin                              read prompt from standard input.\n");
-    fprintf(stream, "  --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
-    fprintf(stream, "  --no-parse-special                   do not parse control tokens.\n");
-    fprintf(stream, "  --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
-    fprintf(stream, "  --show-count                         print the total number of tokens.\n");
+static void print_usage_information(const char * argv0) {
+    printf("usage: %s [options]\n\n", argv0);
+    printf("The tokenize program tokenizes a prompt using a given model,\n");
+    printf("and prints the resulting tokens to standard output.\n\n");
+    printf("It needs a model file, a prompt, and optionally other flags\n");
+    printf("to control the behavior of the tokenizer.\n\n");
+    printf("  The possible options are:\n");
+    printf("\n");
+    printf("  -h, --help                           print this help and exit\n");
+    printf("  -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
+    printf("  --ids                                if given, only print numerical token IDs, and not token strings.\n");
+    printf("                                       The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+    printf("  -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+    printf("  -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
+    printf("  --stdin                              read prompt from standard input.\n");
+    printf("  --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    printf("  --no-parse-special                   do not parse control tokens.\n");
+    printf("  --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+    printf("  --show-count                         print the total number of tokens.\n");
 }

 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) {
     const int argc = argv.size();

     if (argc <= 1) {
-        print_usage_information(argv[0].c_str(), stderr);
+        print_usage_information(argv[0].c_str());
         return 1;
     }

@@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) {
     for (; iarg < argc; ++iarg) {
         std::string arg{argv[iarg]};
         if (arg == "-h" || arg == "--help") {
-            print_usage_information(argv[0].c_str(), stdout);
+            print_usage_information(argv[0].c_str());
             return 0;
         }
         else if (arg == "--ids") {
@@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) {
     // Start actually doing the tokenizing stuff.
     //////

-#ifdef LOG_DISABLE_LOGS
-    disable_logging = true;
-#endif
-
     if (disable_logging) {
         llama_log_set(llama_log_callback_null, NULL);
     }
@@ -362,12 +360,12 @@ int main(int raw_argc, char ** raw_argv) {
         prompt = stdin_buffer.str();
     }

-    const bool model_wants_add_bos = llama_should_add_bos_token(model);
+    const bool model_wants_add_bos = llama_add_bos_token(model);
     const bool add_bos = model_wants_add_bos && !no_bos;
     const bool parse_special = !no_parse_special;

     std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
+    tokens = common_tokenize(model, prompt, add_bos, parse_special);

     if (printing_ids) {
         printf("[");
@@ -382,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) {
         } else {
             bool invalid_utf8 = false;
             printf("%6d -> '", tokens[i]);
-            write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
+            write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
             if (invalid_utf8) {
                 printf("' (utf-8 decode failure)\n");
             } else {
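Note on the renames in the hunks above: the tokenizer helpers from common.h now carry a common_ prefix (common_tokenize, common_token_to_piece), and llama_should_add_bos_token is now llama_add_bos_token. A minimal migration sketch, not part of the diff; print_tokens is a hypothetical helper, the calls are exactly the ones shown above:

// Hypothetical helper, assuming an already loaded model and context.
static void print_tokens(llama_model * model, llama_context * ctx, const std::string & prompt) {
    const bool add_bos = llama_add_bos_token(model); // was: llama_should_add_bos_token
    std::vector<llama_token> tokens =
        common_tokenize(model, prompt, add_bos, /*parse_special=*/true); // was: ::llama_tokenize
    for (const llama_token tok : tokens) {
        // was: llama_token_to_piece
        printf("%6d -> '%s'\n", tok, common_token_to_piece(ctx, tok).c_str());
    }
}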
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -56,6 +56,15 @@ else()
     set(GGML_NATIVE_DEFAULT ON)
 endif()

+# defaults
+if (NOT GGML_LLAMAFILE_DEFAULT)
+    set(GGML_LLAMAFILE_DEFAULT OFF)
+endif()
+
+if (NOT GGML_CUDA_GRAPHS_DEFAULT)
+    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
+endif()
+
 # general
 option(GGML_STATIC "ggml: static link libraries" OFF)
 option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
@@ -83,6 +92,7 @@ else()
 endif()

 option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)

 option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
 option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
@@ -90,6 +100,9 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF)
 option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI"      OFF)
 option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16"      OFF)
+option(GGML_AMX_TILE    "ggml: enable AMX-TILE"         OFF)
+option(GGML_AMX_INT8    "ggml: enable AMX-INT8"         OFF)
+option(GGML_AMX_BF16    "ggml: enable AMX-BF16"         OFF)
 option(GGML_FMA         "ggml: enable FMA"              ${INS_ENB})
 if (NOT MSVC)
     option(GGML_F16C    "ggml: enable F16C"             ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@@ -104,42 +117,40 @@ endif()

 # ggml core
 set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(GGML_CPU "ggml: enable CPU backend" ON)

 # 3rd party libs / backends
 option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
 option(GGML_BLAS       "ggml: use BLAS"                    ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
                      "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE  "ggml: use LLAMAFILE"               OFF)
+option(GGML_LLAMAFILE  "ggml: use LLAMAFILE"               ${GGML_LLAMAFILE_DEFAULT})

 option(GGML_CUDA       "ggml: use CUDA"                    OFF)
 option(GGML_MUSA       "ggml: use MUSA"                    OFF)
-option(GGML_CUDA_FORCE_DMMV   "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
 option(GGML_CUDA_FORCE_MMQ    "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
-set   (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
-set   (GGML_CUDA_MMV_Y  "1"  CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16          "ggml: use 16 bit floats for some calculations" OFF)
-set   (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
-                              "ggml: iters./thread per block for Q2_K/Q6_K")
 set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                      "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY  "ggml: do not use peer to peer copies" OFF)
 option(GGML_CUDA_NO_VMM        "ggml: do not try to use CUDA VMM" OFF)
 option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
-option(GGML_CUDA_USE_GRAPHS    "ggml: use CUDA graphs (llama.cpp only)" OFF)
+option(GGML_CUDA_GRAPHS        "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})

-option(GGML_CURL    "ggml: use libcurl to download model from an URL" OFF)
-option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
+option(GGML_HIP     "ggml: use HIP" OFF)
 option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN  "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS     "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG             "ggml: enable Vulkan debug output" OFF)
 option(GGML_VULKAN_MEMORY_DEBUG      "ggml: enable Vulkan memory debug output" OFF)
+option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
+option(GGML_VULKAN_PERF              "ggml: enable Vulkan perf output" OFF)
 option(GGML_VULKAN_VALIDATE          "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS         "ggml: run Vulkan tests" OFF)
 option(GGML_KOMPUTE "ggml: use Kompute" OFF)
 option(GGML_METAL   "ggml: use Metal" ${GGML_METAL_DEFAULT})
+option(GGML_METAL_USE_BF16      "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG        "ggml: disable Metal debugging" OFF)
 option(GGML_METAL_SHADER_DEBUG  "ggml: compile Metal with -fno-fast-math" OFF)
 option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@@ -148,6 +159,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP "ggml: use OpenMP" ON)
 option(GGML_RPC    "ggml: use RPC" OFF)
+option(GGML_AMX    "ggml: use AMX" OFF)
 option(GGML_SYCL   "ggml: use SYCL" OFF)
 option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
 set (GGML_SYCL_TARGET "INTEL" CACHE STRING
@@ -204,12 +216,14 @@ include(CMakePackageConfigHelpers)
 # all public headers
 set(GGML_PUBLIC_HEADERS
     include/ggml.h
+    include/ggml-cpu.h
    include/ggml-alloc.h
     include/ggml-backend.h
     include/ggml-blas.h
+    include/ggml-cann.h
     include/ggml-cuda.h
-    include/ggml.h
     include/ggml-kompute.h
+    include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
@@ -222,12 +236,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)

 if (BUILD_SHARED_LIBS)
-    install(TARGETS ggml LIBRARY)
+    install(TARGETS ggml      LIBRARY)
+    install(TARGETS ggml-base LIBRARY)
 endif()

+# FIXME: this should be done in the backend cmake files
 if (GGML_METAL)
+    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
     install(
-        FILES src/ggml-metal.metal
+        FILES src/ggml-metal/ggml-metal.metal
         PERMISSIONS
             OWNER_READ
             OWNER_WRITE
package/src/llama.cpp/ggml/include/ggml-alloc.h

@@ -7,8 +7,8 @@ extern "C" {
 #endif

 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
+typedef struct ggml_backend_buffer      * ggml_backend_buffer_t;
+typedef struct ggml_backend             * ggml_backend_t;

 // Tensor allocator
 struct ggml_tallocr {
@@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
 // Graph allocator
 /*
   Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

     // optional: create a worst-case graph and reserve the buffers to avoid reallocations
     ggml_gallocr_reserve(galloc, build_graph(max_batch));
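For context, a short sketch of the graph-allocator flow that the (typo-fixed) example comment above describes; build_graph, max_batch, and batch stand in for application code, and ggml_gallocr_alloc_graph is assumed to be the allocation entry point declared elsewhere in this header:

// reserve against a worst-case graph, then allocate concrete graphs cheaply
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

// optional: reserve worst-case buffers once to avoid reallocations later
ggml_gallocr_reserve(galloc, build_graph(max_batch));

// allocate a concrete graph (reallocates only if it outgrows the reservation)
struct ggml_cgraph * graph = build_graph(batch);
ggml_gallocr_alloc_graph(galloc, graph);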
package/src/llama.cpp/ggml/include/ggml-amx.h

@@ -0,0 +1,25 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// buffer_type API
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+
+GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
+
+GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
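A hedged usage sketch for the new AMX backend header above; the thread count is an arbitrary assumption, and ggml_backend_free comes from ggml-backend.h rather than this diff:

#include "ggml-amx.h"

int main(void) {
    // initialize the AMX backend declared in the new header
    ggml_backend_t backend = ggml_backend_amx_init();
    if (backend != NULL && ggml_backend_is_amx(backend)) {
        ggml_backend_amx_set_n_threads(backend, 8); // assumption: 8 worker threads
        // ... build graphs and compute with ggml_backend_graph_compute(backend, graph) ...
        ggml_backend_free(backend);
    }
    return 0;
}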
package/src/llama.cpp/ggml/include/ggml-backend.h

@@ -3,6 +3,20 @@
 #include "ggml.h"
 #include "ggml-alloc.h"

+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -12,43 +26,52 @@ extern "C" {
     typedef struct ggml_backend_event * ggml_backend_event_t;
     typedef struct ggml_backend * ggml_backend_t;
    typedef void * ggml_backend_graph_plan_t;
+    typedef struct ggml_backend_reg * ggml_backend_reg_t;
+    typedef struct ggml_backend_device * ggml_backend_dev_t;
+

     //
-    // Backend buffer
+    // Backend buffer type
     //

-    // buffer type
-    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
+    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
+
+    //
+    // Backend buffer
+    //

-    // buffer
     enum ggml_backend_buffer_usage {
         GGML_BACKEND_BUFFER_USAGE_ANY = 0,
         GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
         GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
     };

-    GGML_API           const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API           void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API           void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API GGML_CALL void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API           bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API           void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API           enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
-    GGML_API           ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API           void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+    GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
+    GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

     //
-    // Backend
+    // Backend (stream)
     //

     GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
@@ -63,8 +86,10 @@ extern "C" {
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

-    GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+    // "offset" refers to the offset in tensor->data for setting/getting data
+    GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

@@ -74,64 +99,126 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    // NOTE: will be removed, use device version instead
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
     GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
     // asynchronous copy
     // the copy is performed after all the currently queued operations in backend_src
     // backend_dst will wait for the copy to complete before performing other operations
     // automatic fallback to sync copy if async is not supported
     GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

-    // events
-    GGML_API ggml_backend_event_t ggml_backend_event_new        (ggml_backend_t backend);
-    GGML_API void                 ggml_backend_event_free       (ggml_backend_event_t event);
-    GGML_API void                 ggml_backend_event_record     (ggml_backend_event_t event);
-    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void                 ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event);
+    GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);

     //
-    // CPU backend
+    // Events
     //

-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+    GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+    GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
+    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);

-    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+    //
+    // Backend device
+    //

-    // Create a backend buffer from an existing pointer
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    enum ggml_backend_dev_type {
+        // CPU device using system memory
+        GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
+        GGML_BACKEND_DEVICE_TYPE_GPU,
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
+    };

-    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+    // functionality supported by the device
+    struct ggml_backend_dev_caps {
+        // asynchronous operations
+        bool async;
+        // pinned host buffer
+        bool host_buffer;
+        // creating buffers from host ptr
+        bool buffer_from_host_ptr;
+        // event synchronization
+        bool events;
+    };

-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
+    // all the device properties
+    struct ggml_backend_dev_props {
+        const char * name;
+        const char * description;
+        size_t memory_free;
+        size_t memory_total;
+        enum ggml_backend_dev_type type;
+        struct ggml_backend_dev_caps caps;
+    };
+
+    GGML_API const char *               ggml_backend_dev_name(ggml_backend_dev_t device);
+    GGML_API const char *               ggml_backend_dev_description(ggml_backend_dev_t device);
+    GGML_API void                       ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
+    GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
+    GGML_API void                       ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+    GGML_API ggml_backend_reg_t         ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+    GGML_API ggml_backend_t             ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_t      ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+
+    GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+    GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);

     //
-    // Backend registry
+    // Backend (reg)
     //

-    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+    GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
+    GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+    GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+    GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+
+    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
+
+    // Split buffer type for tensor parallelism
+    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    // Set the number of threads for the backend
+    typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+    // Get additional buffer types provided by the device (returns a NULL-terminated array)
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+
+    //
+    // Backend registry
+    //

-    GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
-    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
+    // Backend (reg) enumeration
+    GGML_API size_t             ggml_backend_reg_count(void);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
+
+    // Device enumeration
+    GGML_API size_t             ggml_backend_dev_count(void);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
+
+    // Direct backend (stream) initialization
+    // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
+    GGML_API ggml_backend_t ggml_backend_init_best(void);

     //
     // Backend scheduler
     //

-    // The backend scheduler allows for multiple backends to be used together
+    // The backend scheduler allows for multiple backend devices to be used together
     // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
     // The backends are selected based on:
     // - the backend that supports the operation
@@ -155,20 +242,26 @@ extern "C" {
         ggml_backend_sched_reserve(sched, reserve_graph);

         // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+        for (int i = 0; i < 10; ++i) {
+            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+        }

         // if there are graph inputs:
-        ggml_backend_sched_reset(sched);
-        ggml_backend_sched_alloc_graph(sched, graph);
-        ggml_backend_tensor_set(input_tensor, ...);
-        ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
+        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+
+        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+        // allocate them statically via ggml_backend_alloc_ctx_tensors
     }
   */

-    struct ggml_backend_sched;
     typedef struct ggml_backend_sched * ggml_backend_sched_t;

+    // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
     // when ask == true, the scheduler wants to know if the user wants to observe this node
     // this allows the scheduler to batch nodes together in order to evaluate them in a single call
     //
@@ -177,12 +270,12 @@ extern "C" {
     //
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

-    // Initialize a backend scheduler
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);

     // Initialize backend buffers from a measure graph
-    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

     GGML_API int            ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
     GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
@@ -197,12 +290,14 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

     // Allocate and compute graph on the backend scheduler
-    GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);

-    // Reset all assignments and allocators - must be called before changing the node backends
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

     // Set a callback to be called for each resulting node during graph compute
@@ -223,7 +318,7 @@ extern "C" {
     GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
     GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

-    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

     // Compare the output of two backends
     GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
@@ -232,6 +327,9 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

+    // CPU buffer types are always available
+    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

 #ifdef __cplusplus
 }
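To illustrate the new device and registry API introduced in ggml-backend.h above, a minimal enumeration sketch; every call except ggml_backend_free is declared in the hunks shown (ggml_backend_free comes from an unchanged part of the header), and the loop structure itself is an assumption:

#include "ggml-backend.h"
#include <cstdio>

int main(void) {
    // enumerate all registered devices (CPU, GPU, and ACCEL types)
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_dev_memory(dev, &free_mem, &total_mem);
        printf("device %zu: %s - %s (%zu/%zu bytes free)\n", i,
               ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
               free_mem, total_mem);
    }

    // initialize the preferred backend: a GPU device if present, otherwise the CPU
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend != NULL) {
        // ... allocate buffers and compute graphs on this backend ...
        ggml_backend_free(backend);
    }
    return 0;
}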