@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -0,0 +1,17 @@
1
+ #pragma once
2
+ #include "common.h"
3
+ #include <stdint.h>
4
+
5
+ #ifdef __cplusplus
6
+ extern "C" {
7
+ #endif
8
+
9
+ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
10
+
11
+ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
12
+
13
+ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst);
14
+
15
+ #ifdef __cplusplus
16
+ }
17
+ #endif
@@ -9,144 +9,207 @@ extern "C" {
9
9
  #endif
10
10
 
11
11
  //
12
- // Backend buffer
12
+ // Backend buffer type
13
13
  //
14
14
 
15
- // buffer type
16
- typedef void * ggml_backend_buffer_type_context_t;
17
-
18
15
  struct ggml_backend_buffer_type_i {
19
- const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
16
+ const char * (*get_name) (ggml_backend_buffer_type_t buft);
20
17
  // allocate a buffer of this type
21
- ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
18
+ ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
22
19
  // tensor alignment
23
- size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
24
- // max buffer size that can be allocated
25
- size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
26
- // data size needed to allocate the tensor, including padding
27
- size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
28
- // check if tensor data is in host memory
29
- bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
20
+ size_t (*get_alignment) (ggml_backend_buffer_type_t buft);
21
+ // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
22
+ size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
23
+ // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
24
+ size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
25
+ // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
26
+ bool (*is_host) (ggml_backend_buffer_type_t buft);
30
27
  };
31
28
 
32
29
  struct ggml_backend_buffer_type {
33
30
  struct ggml_backend_buffer_type_i iface;
34
- ggml_backend_buffer_type_context_t context;
31
+ ggml_backend_dev_t device;
32
+ void * context;
35
33
  };
36
34
 
37
- // buffer
38
- typedef void * ggml_backend_buffer_context_t;
35
+ //
36
+ // Backend buffer
37
+ //
39
38
 
40
39
  struct ggml_backend_buffer_i {
41
- const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
42
- void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
43
- void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
44
- void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
45
- void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
46
- void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
47
- bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
48
- void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
49
- void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
40
+ // (optional) free the buffer
41
+ void (*free_buffer) (ggml_backend_buffer_t buffer);
42
+ // base address of the buffer
43
+ void * (*get_base) (ggml_backend_buffer_t buffer);
44
+ // (optional) initialize a tensor in the buffer (eg. add tensor extras)
45
+ void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
46
+ // tensor data access
47
+ void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
48
+ void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
49
+ void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
50
+ // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
51
+ bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
52
+ // clear the entire buffer
53
+ void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
54
+ // (optional) reset any internal state due to tensor initialization, such as tensor extras
55
+ void (*reset) (ggml_backend_buffer_t buffer);
50
56
  };
51
57
 
52
58
  struct ggml_backend_buffer {
53
59
  struct ggml_backend_buffer_i iface;
54
60
  ggml_backend_buffer_type_t buft;
55
- ggml_backend_buffer_context_t context;
61
+ void * context;
56
62
  size_t size;
57
63
  enum ggml_backend_buffer_usage usage;
58
64
  };
59
65
 
60
- GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
61
- ggml_backend_buffer_type_t buft,
62
- struct ggml_backend_buffer_i iface,
63
- ggml_backend_buffer_context_t context,
64
- size_t size);
66
+ ggml_backend_buffer_t ggml_backend_buffer_init(
67
+ ggml_backend_buffer_type_t buft,
68
+ struct ggml_backend_buffer_i iface,
69
+ void * context,
70
+ size_t size);
65
71
 
66
72
  // do not use directly, use ggml_backend_tensor_copy instead
67
73
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
68
74
 
75
+ // multi-buffer
69
76
  // buffer that contains a collection of buffers
70
- GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
71
- GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
72
- GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
77
+ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
78
+ bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
79
+ void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
73
80
 
74
81
  //
75
- // Backend
82
+ // Backend (stream)
76
83
  //
77
84
 
78
- typedef void * ggml_backend_context_t;
79
-
80
85
  struct ggml_backend_i {
81
- const char * (*GGML_CALL get_name)(ggml_backend_t backend);
82
-
83
- void (*GGML_CALL free)(ggml_backend_t backend);
86
+ const char * (*get_name)(ggml_backend_t backend);
84
87
 
85
- // buffer allocation
86
- ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
88
+ void (*free)(ggml_backend_t backend);
87
89
 
88
90
  // (optional) asynchronous tensor data access
89
- void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
90
- void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
91
- bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
91
+ void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
92
+ void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
93
+ bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
92
94
 
93
- // (optional) complete all pending operations
94
- void (*GGML_CALL synchronize)(ggml_backend_t backend);
95
+ // (optional) complete all pending operations (required if the backend supports async operations)
96
+ void (*synchronize)(ggml_backend_t backend);
95
97
 
96
- // compute graph with a plan (not used currently)
97
- // create a new plan for a graph
98
- ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
99
- void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
98
+ // (optional) graph plans (not used currently)
99
+ // compute graph with a plan
100
+ ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
101
+ void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100
102
  // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
101
- void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
103
+ void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
102
104
  // compute the graph with the plan
103
- enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
105
+ enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
104
106
 
105
- // compute graph without a plan (async)
106
- enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
107
-
108
- // check if the backend can compute an operation
109
- bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
110
-
111
- // check if the backend can use tensors allocated in a buffer type
112
- bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
113
-
114
- // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
115
- // these should be expensive operations with large batch sizes that may benefit from running on this backend
116
- // even if the weight has to be copied from the CPU temporarily
117
- bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
107
+ // compute graph (always async if supported by the backend)
108
+ enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
118
109
 
119
110
  // (optional) event synchronization
120
- // create a new event that can record events on this backend instance
121
- ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
122
- void (*GGML_CALL event_free) (ggml_backend_event_t event);
123
- // record an event on the backend instance that created it
124
- void (*GGML_CALL event_record) (ggml_backend_event_t event);
125
- // wait for an event on on a different backend instance
126
- void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
127
- // block until an event is recorded
128
- void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
111
+ // record an event on this stream
112
+ void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
113
+ // wait for an event on on a different stream
114
+ void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
129
115
  };
130
116
 
131
117
  struct ggml_backend {
132
118
  ggml_guid_t guid;
133
-
134
119
  struct ggml_backend_i iface;
135
- ggml_backend_context_t context;
120
+ ggml_backend_dev_t device;
121
+ void * context;
136
122
  };
137
123
 
138
124
  struct ggml_backend_event {
139
- ggml_backend_t backend;
125
+ struct ggml_backend_device * device;
126
+ void * context;
127
+ };
128
+
129
+ //
130
+ // Backend device
131
+ //
132
+
133
+ // Note: if additional properties are needed, we should add a struct with all of them
134
+ // the current functions to obtain the properties can remain, since they are more convenient for often used properties
135
+ struct ggml_backend_device_i {
136
+ // device name: short identifier for this device, such as "CPU" or "CUDA0"
137
+ const char * (*get_name)(ggml_backend_dev_t dev);
138
+
139
+ // device description: short informative description of the device, could be the model name
140
+ const char * (*get_description)(ggml_backend_dev_t dev);
141
+
142
+ // device memory in bytes
143
+ void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
144
+
145
+ // device type
146
+ enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
147
+
148
+ // device properties
149
+ void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
150
+
151
+ // backend (stream) initialization
152
+ ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
153
+
154
+ // preferred buffer type
155
+ ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
156
+
157
+ // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
158
+ ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
159
+
160
+ // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
161
+ ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
162
+
163
+ // check if the backend can compute an operation
164
+ bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
165
+
166
+ // check if the backend can use tensors allocated in a buffer type
167
+ bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
168
+
169
+ // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
170
+ // these should be expensive operations that may benefit from running on this backend instead of the CPU backend
171
+ bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
172
+
173
+ // (optional) event synchronization
174
+ ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
175
+ void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
176
+ void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
177
+ };
178
+
179
+ struct ggml_backend_device {
180
+ struct ggml_backend_device_i iface;
181
+ ggml_backend_reg_t reg;
140
182
  void * context;
141
183
  };
142
184
 
143
185
  //
144
- // Backend registry
186
+ // Backend (reg)
145
187
  //
146
188
 
147
- typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
189
+ struct ggml_backend_reg_i {
190
+ const char * (*get_name)(ggml_backend_reg_t reg);
191
+
192
+ // enumerate available devices
193
+ size_t (*get_device_count)(ggml_backend_reg_t reg);
194
+ ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
195
+
196
+ // (optional) get a pointer to a function in the backend
197
+ // backends can add custom functions that are not part of the standard ggml-backend interface
198
+ void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
199
+ };
200
+
201
+ struct ggml_backend_reg {
202
+ // int api_version; // TODO: for dynamic loading
203
+ struct ggml_backend_reg_i iface;
204
+ void * context;
205
+ };
206
+
148
207
 
149
- GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
208
+ // Internal backend registry API
209
+ void ggml_backend_register(ggml_backend_reg_t reg);
210
+ void ggml_backend_device_register(ggml_backend_dev_t device);
211
+ // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
212
+ // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
150
213
 
151
214
  #ifdef __cplusplus
152
215
  }
@@ -0,0 +1,195 @@
1
+ #include "ggml-backend-impl.h"
2
+ #include "ggml-backend.h"
3
+ #include "ggml-cpu.h"
4
+ #include "ggml-impl.h"
5
+ #include <cstring>
6
+ #include <vector>
7
+
8
+ // Backend registry
9
+
10
+ #ifdef GGML_USE_CUDA
11
+ #include "ggml-cuda.h"
12
+ #endif
13
+
14
+ #ifdef GGML_USE_METAL
15
+ #include "ggml-metal.h"
16
+ #endif
17
+
18
+ #ifdef GGML_USE_SYCL
19
+ #include "ggml-sycl.h"
20
+ #endif
21
+
22
+ #ifdef GGML_USE_VULKAN
23
+ #include "ggml-vulkan.h"
24
+ #endif
25
+
26
+ #ifdef GGML_USE_BLAS
27
+ #include "ggml-blas.h"
28
+ #endif
29
+
30
+ #ifdef GGML_USE_RPC
31
+ #include "ggml-rpc.h"
32
+ #endif
33
+
34
+ #ifdef GGML_USE_AMX
35
+ # include "ggml-amx.h"
36
+ #endif
37
+
38
+ #ifdef GGML_USE_CANN
39
+ #include "ggml-cann.h"
40
+ #endif
41
+
42
+ #ifdef GGML_USE_KOMPUTE
43
+ #include "ggml-kompute.h"
44
+ #endif
45
+
46
+ struct ggml_backend_registry {
47
+ std::vector<ggml_backend_reg_t> backends;
48
+ std::vector<ggml_backend_dev_t> devices;
49
+
50
+ ggml_backend_registry() {
51
+ #ifdef GGML_USE_CUDA
52
+ register_backend(ggml_backend_cuda_reg());
53
+ #endif
54
+ #ifdef GGML_USE_METAL
55
+ register_backend(ggml_backend_metal_reg());
56
+ #endif
57
+ #ifdef GGML_USE_SYCL
58
+ register_backend(ggml_backend_sycl_reg());
59
+ #endif
60
+ #ifdef GGML_USE_VULKAN
61
+ register_backend(ggml_backend_vk_reg());
62
+ #endif
63
+ #ifdef GGML_USE_CANN
64
+ register_backend(ggml_backend_cann_reg());
65
+ #endif
66
+ #ifdef GGML_USE_BLAS
67
+ register_backend(ggml_backend_blas_reg());
68
+ #endif
69
+ #ifdef GGML_USE_RPC
70
+ register_backend(ggml_backend_rpc_reg());
71
+ #endif
72
+ #ifdef GGML_USE_AMX
73
+ register_backend(ggml_backend_amx_reg());
74
+ #endif
75
+ #ifdef GGML_USE_KOMPUTE
76
+ register_backend(ggml_backend_kompute_reg());
77
+ #endif
78
+
79
+ register_backend(ggml_backend_cpu_reg());
80
+ }
81
+
82
+ void register_backend(ggml_backend_reg_t reg) {
83
+ if (!reg) {
84
+ return;
85
+ }
86
+
87
+ #ifndef NDEBUG
88
+ GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
89
+ __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
90
+ #endif
91
+ backends.push_back(reg);
92
+ for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
93
+ register_device(ggml_backend_reg_dev_get(reg, i));
94
+ }
95
+ }
96
+
97
+ void register_device(ggml_backend_dev_t device) {
98
+ #ifndef NDEBUG
99
+ GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
100
+ #endif
101
+ devices.push_back(device);
102
+ }
103
+ };
104
+
105
+ static ggml_backend_registry & get_reg() {
106
+ static ggml_backend_registry reg;
107
+ return reg;
108
+ }
109
+
110
+ // Internal API
111
+ void ggml_backend_register(ggml_backend_reg_t reg) {
112
+ get_reg().register_backend(reg);
113
+ }
114
+
115
+ void ggml_backend_device_register(ggml_backend_dev_t device) {
116
+ get_reg().register_device(device);
117
+ }
118
+
119
+ // Backend (reg) enumeration
120
+ size_t ggml_backend_reg_count() {
121
+ return get_reg().backends.size();
122
+ }
123
+
124
+ ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
125
+ GGML_ASSERT(index < ggml_backend_reg_count());
126
+ return get_reg().backends[index];
127
+ }
128
+
129
+ ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
130
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
131
+ ggml_backend_reg_t reg = ggml_backend_reg_get(i);
132
+ if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
133
+ return reg;
134
+ }
135
+ }
136
+ return NULL;
137
+ }
138
+
139
+ // Device enumeration
140
+ size_t ggml_backend_dev_count() {
141
+ return get_reg().devices.size();
142
+ }
143
+
144
+ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
145
+ GGML_ASSERT(index < ggml_backend_dev_count());
146
+ return get_reg().devices[index];
147
+ }
148
+
149
+ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
150
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
151
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
152
+ if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
153
+ return dev;
154
+ }
155
+ }
156
+ return NULL;
157
+ }
158
+
159
+ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
160
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
161
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
162
+ if (ggml_backend_dev_type(dev) == type) {
163
+ return dev;
164
+ }
165
+ }
166
+ return NULL;
167
+ }
168
+
169
+ // Convenience functions
170
+ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
171
+ ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
172
+ if (!dev) {
173
+ return NULL;
174
+ }
175
+ return ggml_backend_dev_init(dev, params);
176
+ }
177
+
178
+ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
179
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
180
+ if (!dev) {
181
+ return NULL;
182
+ }
183
+ return ggml_backend_dev_init(dev, params);
184
+ }
185
+
186
+ ggml_backend_t ggml_backend_init_best(void) {
187
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
188
+ if (!dev) {
189
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
190
+ }
191
+ if (!dev) {
192
+ return NULL;
193
+ }
194
+ return ggml_backend_dev_init(dev, NULL);
195
+ }