@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -1,3 +1,14 @@
1
+ // Note: porting this file to C++ is a work in progress
2
+
3
+ #ifdef _WIN32
4
+ #define WIN32_LEAN_AND_MEAN
5
+ #ifndef NOMINMAX
6
+ # define NOMINMAX
7
+ #endif
8
+ #include <windows.h>
9
+ #endif
10
+
11
+ #include "ggml-backend.h"
1
12
  #include "ggml-backend-impl.h"
2
13
  #include "ggml-alloc.h"
3
14
  #include "ggml-impl.h"
@@ -8,9 +19,14 @@
8
19
  #include <stdio.h>
9
20
  #include <stdlib.h>
10
21
  #include <string.h>
22
+ #include <string>
23
+ #include <vector>
11
24
 
25
+ #ifdef __APPLE__
26
+ #include <sys/types.h>
27
+ #include <sys/sysctl.h>
28
+ #endif
12
29
 
13
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
30
 
15
31
  // backend buffer type
16
32
 
@@ -18,7 +34,12 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
18
34
  return buft->iface.get_name(buft);
19
35
  }
20
36
 
21
- GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
37
+ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
38
+ if (size == 0) {
39
+ // return a dummy buffer for zero-sized allocations
40
+ return ggml_backend_buffer_init(buft, {}, NULL, 0);
41
+ }
42
+
22
43
  return buft->iface.alloc_buffer(buft, size);
23
44
  }
24
45
 
@@ -34,7 +55,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
34
55
  return SIZE_MAX;
35
56
  }
36
57
 
37
- GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
58
+ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
38
59
  // get_alloc_size is optional, defaults to ggml_nbytes
39
60
  if (buft->iface.get_alloc_size) {
40
61
  size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -51,16 +72,18 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
51
72
  return false;
52
73
  }
53
74
 
54
- // backend buffer
75
+ ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
76
+ return buft->device;
77
+ }
55
78
 
56
- GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
57
- ggml_backend_buffer_type_t buft,
58
- struct ggml_backend_buffer_i iface,
59
- ggml_backend_buffer_context_t context,
60
- size_t size) {
61
- ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
79
+ // backend buffer
62
80
 
63
- (*buffer) = (struct ggml_backend_buffer) {
81
+ ggml_backend_buffer_t ggml_backend_buffer_init(
82
+ ggml_backend_buffer_type_t buft,
83
+ struct ggml_backend_buffer_i iface,
84
+ void * context,
85
+ size_t size) {
86
+ ggml_backend_buffer_t buffer = new ggml_backend_buffer {
64
87
  /* .interface = */ iface,
65
88
  /* .buft = */ buft,
66
89
  /* .context = */ context,
@@ -72,7 +95,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
72
95
  }
73
96
 
74
97
  const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
75
- return buffer->iface.get_name(buffer);
98
+ return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
76
99
  }
77
100
 
78
101
  void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -83,7 +106,7 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
83
106
  if (buffer->iface.free_buffer != NULL) {
84
107
  buffer->iface.free_buffer(buffer);
85
108
  }
86
- free(buffer);
109
+ delete buffer;
87
110
  }
88
111
 
89
112
  size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -91,6 +114,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
91
114
  }
92
115
 
93
116
  void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
117
+ // get_base is optional if the buffer is zero-sized
118
+ if (buffer->size == 0) {
119
+ return NULL;
120
+ }
121
+
94
122
  void * base = buffer->iface.get_base(buffer);
95
123
 
96
124
  GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -98,14 +126,23 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
98
126
  return base;
99
127
  }
100
128
 
101
- GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
129
+ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
102
130
  // init_tensor is optional
103
131
  if (buffer->iface.init_tensor) {
104
132
  buffer->iface.init_tensor(buffer, tensor);
105
133
  }
106
134
  }
107
135
 
108
- size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
136
+ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
137
+ // clear is optional if the buffer is zero-sized
138
+ if (buffer->size == 0) {
139
+ return;
140
+ }
141
+
142
+ buffer->iface.clear(buffer, value);
143
+ }
144
+
145
+ size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
109
146
  return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
110
147
  }
111
148
 
@@ -117,10 +154,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
117
154
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
118
155
  }
119
156
 
120
- void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
121
- buffer->iface.clear(buffer, value);
122
- }
123
-
124
157
  bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
125
158
  return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
126
159
  }
@@ -181,7 +214,7 @@ void ggml_backend_free(ggml_backend_t backend) {
181
214
  }
182
215
 
183
216
  ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
184
- return backend->iface.get_default_buffer_type(backend);
217
+ return ggml_backend_dev_buffer_type(backend->device);
185
218
  }
186
219
 
187
220
  ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
@@ -218,32 +251,47 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
218
251
  }
219
252
  }
220
253
 
221
- GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
254
+ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
222
255
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
223
256
 
257
+ if (size == 0) {
258
+ return;
259
+ }
260
+
224
261
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
225
262
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
226
263
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
227
264
 
228
- if (!size) {
229
- return;
230
- }
231
-
232
265
  buf->iface.set_tensor(buf, tensor, data, offset, size);
233
266
  }
234
267
 
235
- GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
268
+ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
236
269
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
237
270
 
271
+ if (size == 0) {
272
+ return;
273
+ }
274
+
238
275
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
239
276
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
240
277
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
241
278
 
242
- if (!size) {
279
+ buf->iface.get_tensor(buf, tensor, data, offset, size);
280
+ }
281
+
282
+ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
283
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
284
+
285
+ if (size == 0) {
243
286
  return;
244
287
  }
245
288
 
246
- buf->iface.get_tensor(buf, tensor, data, offset, size);
289
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
290
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
291
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
292
+ GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
293
+
294
+ buf->iface.memset_tensor(buf, tensor, value, offset, size);
247
295
  }
248
296
 
249
297
  void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -283,18 +331,19 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
283
331
  }
284
332
 
285
333
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
286
- return backend->iface.supports_op(backend, op);
334
+ return ggml_backend_dev_supports_op(backend->device, op);
287
335
  }
288
336
 
289
337
  bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
290
- return backend->iface.supports_buft(backend, buft);
338
+ return ggml_backend_dev_supports_buft(backend->device, buft);
291
339
  }
292
340
 
293
341
  bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
294
- if (backend->iface.offload_op != NULL) {
295
- return backend->iface.offload_op(backend, op);
296
- }
297
- return false;
342
+ return ggml_backend_dev_offload_op(backend->device, op);
343
+ }
344
+
345
+ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
346
+ return backend->device;
298
347
  }
299
348
 
300
349
  // backend copy
@@ -327,7 +376,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
327
376
  ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
328
377
  } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
329
378
  #ifndef NDEBUG
330
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
379
+ GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
331
380
  #endif
332
381
  size_t nbytes = ggml_nbytes(src);
333
382
  void * data = malloc(nbytes);
@@ -351,43 +400,39 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
351
400
  }
352
401
 
353
402
  // an async copy would normally happen after all the queued operations on both backends are completed
354
- // sync src, set_async dst
355
- if (ggml_backend_buffer_is_host(src->buffer)) {
356
- ggml_backend_synchronize(backend_src);
357
- ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
358
- } else {
359
- ggml_backend_synchronize(backend_src);
360
- ggml_backend_tensor_copy(src, dst);
361
- ggml_backend_synchronize(backend_dst);
362
- }
403
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
404
+ ggml_backend_synchronize(backend_src);
405
+ ggml_backend_synchronize(backend_dst);
406
+ ggml_backend_tensor_copy(src, dst);
363
407
  }
364
408
 
365
409
  // events
366
410
 
367
- ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
368
- if (backend->iface.event_new == NULL) {
411
+ ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
412
+ // null device is allowed for the transition period to the device interface
413
+ if (device == NULL || device->iface.event_new == NULL) {
369
414
  return NULL;
370
415
  }
371
- return backend->iface.event_new(backend);
416
+ return device->iface.event_new(device);
372
417
  }
373
418
 
374
419
  void ggml_backend_event_free(ggml_backend_event_t event) {
375
420
  if (event == NULL) {
376
421
  return;
377
422
  }
378
- event->backend->iface.event_free(event);
423
+ event->device->iface.event_free(event->device, event);
379
424
  }
380
425
 
381
- void ggml_backend_event_record(ggml_backend_event_t event) {
382
- GGML_ASSERT(event->backend->iface.event_record != NULL);
426
+ void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
427
+ GGML_ASSERT(backend->iface.event_record != NULL);
383
428
 
384
- event->backend->iface.event_record(event);
429
+ backend->iface.event_record(backend, event);
385
430
  }
386
431
 
387
432
  void ggml_backend_event_synchronize(ggml_backend_event_t event) {
388
- GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
433
+ GGML_ASSERT(event->device->iface.event_synchronize);
389
434
 
390
- event->backend->iface.event_synchronize(event);
435
+ event->device->iface.event_synchronize(event->device, event);
391
436
  }
392
437
 
393
438
  void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
@@ -396,536 +441,88 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
396
441
  backend->iface.event_wait(backend, event);
397
442
  }
398
443
 
399
- // backend registry
400
-
401
- #define GGML_REG_MAX_BACKENDS 64
402
-
403
- struct ggml_backend_reg {
404
- char name[128];
405
- ggml_backend_init_fn init_fn;
406
- ggml_backend_buffer_type_t default_buffer_type;
407
- void * user_data;
408
- };
409
-
410
- static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
411
- static size_t ggml_backend_registry_count = 0;
412
-
413
- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
414
-
415
- GGML_CALL static void ggml_backend_registry_init(void) {
416
- static bool initialized = false;
417
-
418
- if (initialized) {
419
- return;
420
- }
421
-
422
- initialized = true;
423
-
424
- ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
425
-
426
- // add forward decls here to avoid including the backend headers
427
- #ifdef GGML_USE_CUDA
428
- extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
429
- ggml_backend_cuda_reg_devices();
430
- #endif
431
-
432
- #ifdef GGML_USE_SYCL
433
- extern void ggml_backend_sycl_reg_devices(void);
434
- ggml_backend_sycl_reg_devices();
435
- #endif
436
-
437
- #ifdef GGML_USE_METAL
438
- extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
439
- extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
440
- ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
441
- #endif
442
-
443
- #ifdef GGML_USE_VULKAN
444
- extern GGML_CALL int ggml_backend_vk_reg_devices(void);
445
- ggml_backend_vk_reg_devices();
446
- #endif
447
-
448
- #ifdef GGML_USE_KOMPUTE
449
- extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
450
- ggml_backend_kompute_reg_devices();
451
- #endif
452
-
453
- #ifdef GGML_USE_CANN
454
- extern GGML_CALL int ggml_backend_cann_reg_devices(void);
455
- ggml_backend_cann_reg_devices();
456
- #endif
457
- }
458
-
459
- GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
460
- GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
461
-
462
- size_t id = ggml_backend_registry_count;
463
-
464
- ggml_backend_registry[id] = (struct ggml_backend_reg) {
465
- /* .name = */ {0},
466
- /* .fn = */ init_fn,
467
- /* .default_buffer_type = */ default_buffer_type,
468
- /* .user_data = */ user_data,
469
- };
470
-
471
- snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
472
-
473
- #ifndef NDEBUG
474
- fprintf(stderr, "%s: registered backend %s\n", __func__, name);
475
- #endif
476
-
477
- ggml_backend_registry_count++;
478
- }
479
-
480
- size_t ggml_backend_reg_get_count(void) {
481
- ggml_backend_registry_init();
444
+ // Backend device
482
445
 
483
- return ggml_backend_registry_count;
446
+ const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
447
+ return device->iface.get_name(device);
484
448
  }
485
449
 
486
- size_t ggml_backend_reg_find_by_name(const char * name) {
487
- ggml_backend_registry_init();
488
-
489
- for (size_t i = 0; i < ggml_backend_registry_count; i++) {
490
- // TODO: case insensitive in a portable way
491
- if (strcmp(ggml_backend_registry[i].name, name) == 0) {
492
- return i;
493
- }
494
- }
495
-
496
- // not found
497
- return SIZE_MAX;
450
+ const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
451
+ return device->iface.get_description(device);
498
452
  }
499
453
 
500
- // init from backend:params string
501
- ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
502
- ggml_backend_registry_init();
503
-
504
- const char * params = strchr(backend_str, ':');
505
- char backend_name[128];
506
- if (params == NULL) {
507
- snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
508
- params = "";
509
- } else {
510
- snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
511
- params++;
512
- }
513
-
514
- size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
515
-
516
- if (backend_i == SIZE_MAX) {
517
- fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
518
- return NULL;
519
- }
520
-
521
- return ggml_backend_reg_init_backend(backend_i, params);
454
+ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
455
+ device->iface.get_memory(device, free, total);
522
456
  }
523
457
 
524
- const char * ggml_backend_reg_get_name(size_t i) {
525
- ggml_backend_registry_init();
526
-
527
- GGML_ASSERT(i < ggml_backend_registry_count);
528
- return ggml_backend_registry[i].name;
458
+ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
459
+ return device->iface.get_type(device);
529
460
  }
530
461
 
531
- ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
532
- ggml_backend_registry_init();
533
-
534
- GGML_ASSERT(i < ggml_backend_registry_count);
535
- return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
462
+ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
463
+ memset(props, 0, sizeof(*props));
464
+ device->iface.get_props(device, props);
536
465
  }
537
466
 
538
- ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
539
- ggml_backend_registry_init();
540
-
541
- GGML_ASSERT(i < ggml_backend_registry_count);
542
- return ggml_backend_registry[i].default_buffer_type;
543
- }
544
-
545
- ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
546
- ggml_backend_registry_init();
547
-
548
- GGML_ASSERT(i < ggml_backend_registry_count);
549
- return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
550
- }
551
-
552
- // backend CPU
553
-
554
- static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
555
-
556
- GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
557
- return "CPU";
558
-
559
- GGML_UNUSED(buffer);
560
- }
561
-
562
- GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
563
- uintptr_t data = (uintptr_t)buffer->context;
564
-
565
- // align the buffer
566
- if (data % TENSOR_ALIGNMENT != 0) {
567
- data = GGML_PAD(data, TENSOR_ALIGNMENT);
568
- }
569
-
570
- return (void *)data;
571
- }
572
-
573
- GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
574
- free(buffer->context);
575
- }
576
-
577
- GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
578
- memcpy((char *)tensor->data + offset, data, size);
579
-
580
- GGML_UNUSED(buffer);
581
- }
582
-
583
- GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
584
- memcpy(data, (const char *)tensor->data + offset, size);
585
-
586
- GGML_UNUSED(buffer);
587
- }
588
-
589
- GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
590
- if (ggml_backend_buffer_is_host(src->buffer)) {
591
- memcpy(dst->data, src->data, ggml_nbytes(src));
592
- return true;
593
- }
594
- return false;
595
-
596
- GGML_UNUSED(buffer);
597
- }
598
-
599
- GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
600
- memset(buffer->context, value, buffer->size);
601
- }
602
-
603
- static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
604
- /* .get_name = */ ggml_backend_cpu_buffer_name,
605
- /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
606
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
607
- /* .init_tensor = */ NULL, // no initialization required
608
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
609
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
610
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
611
- /* .clear = */ ggml_backend_cpu_buffer_clear,
612
- /* .reset = */ NULL,
613
- };
614
-
615
- // for buffers from ptr, free is not called
616
- static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
617
- /* .get_name = */ ggml_backend_cpu_buffer_name,
618
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
619
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
620
- /* .init_tensor = */ NULL, // no initialization required
621
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
622
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
623
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
624
- /* .clear = */ ggml_backend_cpu_buffer_clear,
625
- /* .reset = */ NULL,
626
- };
627
-
628
- GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
629
- return "CPU";
630
-
631
- GGML_UNUSED(buft);
632
- }
633
-
634
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
635
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
636
- void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
637
- if (data == NULL) {
638
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
639
- return NULL;
640
- }
641
-
642
- return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
643
- }
644
-
645
- GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
646
- return TENSOR_ALIGNMENT;
647
-
648
- GGML_UNUSED(buft);
649
- }
650
-
651
- GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
652
- return true;
653
-
654
- GGML_UNUSED(buft);
655
- }
656
-
657
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
658
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
659
- /* .iface = */ {
660
- /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
661
- /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
662
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
663
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
664
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
665
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
666
- },
667
- /* .context = */ NULL,
668
- };
669
-
670
- return &ggml_backend_cpu_buffer_type;
671
- }
672
-
673
- #ifdef GGML_USE_CPU_HBM
674
-
675
- // buffer type HBM
676
-
677
- #include <hbwmalloc.h>
678
-
679
- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
680
- return "CPU_HBM";
681
-
682
- GGML_UNUSED(buft);
467
+ ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
468
+ return device->reg;
683
469
  }
684
470
 
685
- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
686
- return "CPU_HBM";
687
-
688
- GGML_UNUSED(buf);
471
+ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
472
+ return device->iface.init_backend(device, params);
689
473
  }
690
474
 
691
- GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
692
- hbw_free(buffer->context);
475
+ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
476
+ return device->iface.get_buffer_type(device);
693
477
  }
694
478
 
695
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
696
- //void * ptr = hbw_malloc(size);
697
- void * ptr;
698
- int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
699
- if (result != 0) {
700
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
479
+ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
480
+ if (device->iface.get_host_buffer_type == NULL) {
701
481
  return NULL;
702
482
  }
703
483
 
704
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
705
- buffer->buft = buft;
706
- buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
707
- buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
708
-
709
- return buffer;
710
- }
711
-
712
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
713
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
714
- /* .iface = */ {
715
- /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
716
- /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
717
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
718
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
719
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
720
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
721
- },
722
- /* .context = */ NULL,
723
- };
724
-
725
- return &ggml_backend_cpu_buffer_type_hbm;
484
+ return device->iface.get_host_buffer_type(device);
726
485
  }
727
- #endif
728
-
729
- struct ggml_backend_cpu_context {
730
- int n_threads;
731
- void * work_data;
732
- size_t work_size;
733
-
734
- ggml_abort_callback abort_callback;
735
- void * abort_callback_data;
736
- };
737
-
738
- GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
739
- return "CPU";
740
486
 
741
- GGML_UNUSED(backend);
487
+ ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
488
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
742
489
  }
743
490
 
744
- GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
745
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
746
- free(cpu_ctx->work_data);
747
- free(cpu_ctx);
748
- free(backend);
491
+ bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
492
+ return device->iface.supports_op(device, op);
749
493
  }
750
494
 
751
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
752
- return ggml_backend_cpu_buffer_type();
753
-
754
- GGML_UNUSED(backend);
495
+ bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
496
+ return device->iface.supports_buft(device, buft);
755
497
  }
756
498
 
757
- struct ggml_backend_plan_cpu {
758
- struct ggml_cplan cplan;
759
- struct ggml_cgraph cgraph;
760
- };
761
-
762
- GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
763
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
764
-
765
- struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
766
-
767
- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
768
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
769
-
770
- if (cpu_plan->cplan.work_size > 0) {
771
- cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
772
- if (cpu_plan->cplan.work_data == NULL) {
773
- free(cpu_plan);
774
- return NULL;
775
- }
499
+ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
500
+ if (device->iface.offload_op != NULL) {
501
+ return device->iface.offload_op(device, op);
776
502
  }
777
503
 
778
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
779
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
780
-
781
- return cpu_plan;
504
+ return false;
782
505
  }
783
506
 
784
- GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
785
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
786
-
787
- free(cpu_plan->cplan.work_data);
788
- free(cpu_plan);
507
+ // Backend (reg)
789
508
 
790
- GGML_UNUSED(backend);
509
+ const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
510
+ return reg->iface.get_name(reg);
791
511
  }
792
512
 
793
- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
794
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
795
-
796
- return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
797
-
798
- GGML_UNUSED(backend);
513
+ size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
514
+ return reg->iface.get_device_count(reg);
799
515
  }
800
516
 
801
- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
802
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
803
-
804
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
805
-
806
- if (cpu_ctx->work_size < cplan.work_size) {
807
- free(cpu_ctx->work_data);
808
- cpu_ctx->work_data = malloc(cplan.work_size);
809
- if (cpu_ctx->work_data == NULL) {
810
- cpu_ctx->work_size = 0;
811
- return GGML_STATUS_ALLOC_FAILED;
812
- }
813
- cpu_ctx->work_size = cplan.work_size;
814
- }
815
- cplan.work_data = cpu_ctx->work_data;
816
-
817
- cplan.abort_callback = cpu_ctx->abort_callback;
818
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
819
-
820
- return ggml_graph_compute(cgraph, &cplan);
821
- }
822
-
823
- GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
824
- switch (op->op) {
825
- case GGML_OP_CPY:
826
- return
827
- op->type != GGML_TYPE_IQ2_XXS &&
828
- op->type != GGML_TYPE_IQ2_XS &&
829
- op->type != GGML_TYPE_IQ1_S &&
830
- op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
831
- case GGML_OP_MUL_MAT:
832
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
833
- default:
834
- return true;
835
- }
836
-
837
- GGML_UNUSED(backend);
838
- }
839
-
840
- GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
841
- return ggml_backend_buft_is_host(buft);
842
-
843
- GGML_UNUSED(backend);
844
- }
845
-
846
- static struct ggml_backend_i cpu_backend_i = {
847
- /* .get_name = */ ggml_backend_cpu_name,
848
- /* .free = */ ggml_backend_cpu_free,
849
- /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
850
- /* .set_tensor_async = */ NULL,
851
- /* .get_tensor_async = */ NULL,
852
- /* .cpy_tensor_async = */ NULL,
853
- /* .synchronize = */ NULL,
854
- /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
855
- /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
856
- /* .graph_plan_update = */ NULL,
857
- /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
858
- /* .graph_compute = */ ggml_backend_cpu_graph_compute,
859
- /* .supports_op = */ ggml_backend_cpu_supports_op,
860
- /* .supports_buft = */ ggml_backend_cpu_supports_buft,
861
- /* .offload_op = */ NULL,
862
- /* .event_new = */ NULL,
863
- /* .event_free = */ NULL,
864
- /* .event_record = */ NULL,
865
- /* .event_wait = */ NULL,
866
- /* .event_synchronize = */ NULL,
867
- };
868
-
869
- static ggml_guid_t ggml_backend_cpu_guid(void) {
870
- static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
871
- return &guid;
517
+ ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
518
+ return reg->iface.get_device(reg, index);
872
519
  }
873
520
 
874
- ggml_backend_t ggml_backend_cpu_init(void) {
875
- struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
876
- if (ctx == NULL) {
877
- return NULL;
878
- }
879
-
880
- ctx->n_threads = GGML_DEFAULT_N_THREADS;
881
- ctx->work_data = NULL;
882
- ctx->work_size = 0;
883
- ctx->abort_callback = NULL;
884
- ctx->abort_callback_data = NULL;
885
-
886
- ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
887
- if (cpu_backend == NULL) {
888
- free(ctx);
521
+ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
522
+ if (!reg->iface.get_proc_address) {
889
523
  return NULL;
890
524
  }
891
-
892
- *cpu_backend = (struct ggml_backend) {
893
- /* .guid = */ ggml_backend_cpu_guid(),
894
- /* .interface = */ cpu_backend_i,
895
- /* .context = */ ctx
896
- };
897
- return cpu_backend;
898
- }
899
-
900
- GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
901
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
902
- }
903
-
904
- void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
905
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
906
-
907
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
908
- ctx->n_threads = n_threads;
909
- }
910
-
911
- void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
912
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
913
-
914
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
915
- ctx->abort_callback = abort_callback;
916
- ctx->abort_callback_data = abort_callback_data;
917
- }
918
-
919
- GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
920
- GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
921
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
922
- }
923
-
924
- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
925
- return ggml_backend_cpu_init();
926
-
927
- GGML_UNUSED(params);
928
- GGML_UNUSED(user_data);
525
+ return reg->iface.get_proc_address(reg, name);
929
526
  }
930
527
 
931
528
  // multi-buffer buffer
@@ -935,16 +532,8 @@ struct ggml_backend_multi_buffer_context {
935
532
  size_t n_buffers;
936
533
  };
937
534
 
938
- typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
939
-
940
- GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
941
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
942
-
943
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
944
- }
945
-
946
- GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
947
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
535
+ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
536
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
948
537
  for (size_t i = 0; i < ctx->n_buffers; i++) {
949
538
  ggml_backend_buffer_free(ctx->buffers[i]);
950
539
  }
@@ -953,31 +542,27 @@ GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_
953
542
  free(ctx);
954
543
  }
955
544
 
956
- GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
  for (size_t i = 0; i < ctx->n_buffers; i++) {
  ggml_backend_buffer_clear(ctx->buffers[i], value);
  }
  }

- static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
- static struct ggml_backend_buffer_i multi_backend_buffer_i = {
- /* .get_name = */ ggml_backend_multi_buffer_get_name,
- /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
- /* .get_base = */ NULL,
- /* .init_tensor = */ NULL,
- /* .set_tensor = */ NULL,
- /* .get_tensor = */ NULL,
- /* .cpy_tensor = */ NULL,
- /* .clear = */ ggml_backend_multi_buffer_clear,
- /* .reset = */ NULL,
- };
-
- return multi_backend_buffer_i;
- }
+ static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
+ /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
+ /* .get_base = */ NULL,
+ /* .init_tensor = */ NULL,
+ /* .memset_tensor = */ NULL,
+ /* .set_tensor = */ NULL,
+ /* .get_tensor = */ NULL,
+ /* .cpy_tensor = */ NULL,
+ /* .clear = */ ggml_backend_multi_buffer_clear,
+ /* .reset = */ NULL,
+ };

- GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
  ctx->n_buffers = n_buffers;
  ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));

@@ -989,16 +574,16 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
  total_size += ggml_backend_buffer_get_size(buffers[i]);
  }

- return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+ return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
  }

- GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+ bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+ return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
  }

- GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+ void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
  GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
  for (size_t i = 0; i < ctx->n_buffers; i++) {
  ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
  }
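Illustrative sketch (not part of the patched source): how the multi-buffer helpers touched above can be used to group several buffers behind one handle. The allocation/free calls and exact signatures are assumed from ggml-backend.h.

    // group two independently allocated buffers behind one multi-buffer;
    // clear and set_usage on the wrapper fan out to every part
    ggml_backend_buffer_t parts[2];
    parts[0] = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), 1u << 20);
    parts[1] = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), 4u << 20);

    ggml_backend_buffer_t multi = ggml_backend_multi_buffer_alloc_buffer(parts, 2);
    ggml_backend_buffer_clear(multi, 0);                                            // clears both parts
    ggml_backend_multi_buffer_set_usage(multi, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);  // tags both parts
    ggml_backend_buffer_free(multi);                                                // releases the parts as well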
@@ -1023,10 +608,6 @@ static bool ggml_is_view_op(enum ggml_op op) {
  #define GGML_SCHED_MAX_BACKENDS 16
  #endif

- #ifndef GGML_SCHED_MAX_SPLITS
- #define GGML_SCHED_MAX_SPLITS 2048
- #endif
-
  #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
  #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
  #endif
@@ -1089,7 +670,7 @@ struct ggml_backend_sched {
  char * context_buffer;
  size_t context_buffer_size;

- bool debug;
+ int debug;
  };

  #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1108,7 +689,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
  }

  static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
- ggml_backend_buffer_t buffer = tensor->buffer;
+ ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
  if (buffer == NULL) {
  return -1;
  }
@@ -1122,7 +703,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
  }

  #ifndef NDEBUG
- fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+ GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
  __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
  #endif

@@ -1130,7 +711,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
  }

  #if 0
- static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
+ #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
  #define GET_CAUSE(node) causes[hash_id(node)]
  #else
@@ -1140,8 +722,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED

  // returns the backend that should be used for the node based on the current locations
  static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
- // TODO: use supports_op to check if the backend supports the op
-
  // assign pre-allocated nodes to their backend
  int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
  if (cur_backend_id != -1) {
@@ -1158,6 +738,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  }
  }

+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+ // since the tensor is pre-allocated, it cannot be moved to another backend
+ GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
+ }
+
  // graph input
  if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
  cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
@@ -1171,7 +756,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  if (src == NULL) {
  continue;
  }
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
+ // not an ideal solution
+ if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
  // check if a backend with higher prio wants to offload the op
  if (src_backend_id == sched->n_backends - 1) {
@@ -1205,32 +792,34 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
  for (int i = 0; i < graph->n_nodes; i++) {
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
  ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+ GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
  sched->splits[cur_split].n_inputs);
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+ GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
  fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
  }
- fprintf(stderr, "\n");
+ GGML_LOG_DEBUG("\n");
  cur_split++;
  }
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
  }
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
+ if (sched->debug > 1) {
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+ GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+ fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
  }
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
- fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+ GGML_LOG_DEBUG("\n");
  }
- fprintf(stderr, "\n");
  }
  }

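A note on the gating above: GGML_SCHED_DEBUG is now read as an integer level rather than a flag (see the ggml_backend_sched_new hunk further down). As inferred from these two hunks, level 1 prints only the split summary, while level 2 or higher also prints the per-node assignments. A minimal sketch of that parsing, mirroring the change:

    // 0 or unset = silent, 1 = split summary, >1 = splits plus per-node detail
    const char * dbg = getenv("GGML_SCHED_DEBUG");
    int debug_level = dbg ? atoi(dbg) : 0;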
@@ -1295,6 +884,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  int * node_backend_id = &tensor_backend_id(node);
+ if (ggml_is_view_op(node->op)) {
+ continue;
+ }
  // do not overwrite user assignments
  if (*node_backend_id == -1) {
  *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
@@ -1522,11 +1114,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  if (src == NULL) {
  continue;
  }
- // check if a weight is on a different backend
+ // check if a weight is on a different and incompatible backend
  // by starting a new split, the memory of the previously offloaded weights can be reused
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = tensor_backend_id(src);
- if (src_backend_id != cur_backend_id) {
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
  need_new_split = true;
  break;
  }
@@ -1538,7 +1130,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  int src_backend_id = sched->hv_tensor_backend_ids[id];
  bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
  if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
  need_new_split = true;
  break;
  }
@@ -1551,10 +1142,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  i_split++;
  if (i_split >= sched->splits_capacity) {
  sched->splits_capacity *= 2;
- sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+ sched->splits = (ggml_backend_sched_split *)
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
  GGML_ASSERT(sched->splits != NULL);
  }
- GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
  split = &sched->splits[i_split];
  split->backend_id = node_backend_id;
  split->i_start = i;
@@ -1638,11 +1229,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  sched->prev_leaf_backend_ids = tmp;
  }

- int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
  if (sched->graph.size < graph_size) {
  sched->graph.size = graph_size;
- sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
- sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
+ sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
+ sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
  GGML_ASSERT(sched->graph.nodes != NULL);
  GGML_ASSERT(sched->graph.leafs != NULL);
  }
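For a sense of scale, a worked example of the new capacity formula above, assuming GGML_SCHED_MAX_SPLIT_INPUTS = GGML_MAX_SRC = 10 and purely hypothetical graph dimensions:

    // graph with 1024 nodes, 256 leafs, 4 splits, 4 pipeline copies:
    //   new: max(1024, 256) + 4*10*2*4 = 1024 + 320 = 1344 tensor slots
    //   old: 1024 + 4*10*2 = 1104 (the old formula ignored n_leafs and n_copies)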
@@ -1690,6 +1281,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  for (int c = 0; c < sched->n_copies; c++) {
  struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+ assert(graph_copy->size > graph_copy->n_leafs);
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
  }
  }
@@ -1703,6 +1295,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  for (int c = 0; c < sched->n_copies; c++) {
  struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+ assert(graph_copy->size > graph_copy->n_leafs);
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
  }
  }
@@ -1713,6 +1306,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  for (int i = 0; i < graph->n_leafs; i++) {
  struct ggml_tensor * leaf = graph->leafs[i];
  sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+ assert(graph_copy->size > graph_copy->n_leafs);
  graph_copy->leafs[graph_copy->n_leafs++] = leaf;
  }
  }
@@ -1741,11 +1335,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
  // the re-allocation may cause the split inputs to be moved to a different address
  ggml_backend_sched_synchronize(sched);
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+ GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
  #endif
  ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
  if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
+ GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
  return false;
  }
  }
@@ -1782,7 +1376,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
  } else {
  ggml_backend_synchronize(split_backend);
  }
- ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+ ggml_backend_synchronize(input_backend);
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ ggml_backend_synchronize(split_backend);
+ }
+ ggml_backend_tensor_copy(input, input_cpy);
+ }
  }
  }

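The fallback in this hunk is the pattern the TODO refers to. A standalone sketch of it, assuming direct access to the backend interface (which, as the comment notes, applications do not normally have):

    // best-effort copy: prefer the destination backend's async path, otherwise
    // synchronize the source and fall back to a plain blocking copy
    static void copy_tensor_best_effort(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                                        struct ggml_tensor * src, struct ggml_tensor * dst) {
        if (backend_dst->iface.cpy_tensor_async &&
            backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
            return; // async copy accepted by the backend
        }
        ggml_backend_synchronize(backend_src);  // make sure src data is ready
        ggml_backend_tensor_copy(src, dst);     // caller must ensure dst is safe to overwrite
    }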
@@ -1828,7 +1432,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
  // record the event of this copy
  if (split->n_inputs > 0) {
  if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
- ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+ ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
  }
  }
  }
@@ -1846,40 +1450,43 @@ ggml_backend_sched_t ggml_backend_sched_new(
  bool parallel) {
  GGML_ASSERT(n_backends > 0);
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
- GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+ GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);

- struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
+ struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));

- sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+ const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+ sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
  sched->n_backends = n_backends;
  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

  // initialize hash table
  // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
  sched->hash_set = ggml_hash_set_new(graph_size);
- sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
- sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+ sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));

- const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
- sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
- sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
+ const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+ const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

- sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
- sched->context_buffer = malloc(sched->context_buffer_size);
+ sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);

  const int initial_splits_capacity = 16;
- sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
+ sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
  sched->splits_capacity = initial_splits_capacity;

  for (int b = 0; b < n_backends; b++) {
  sched->backends[b] = backends[b];
  sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
  GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
  if (sched->n_copies > 1) {
  for (int c = 0; c < sched->n_copies; c++) {
- sched->events[b][c] = ggml_backend_event_new(backends[b]);
+ sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
  }
  }
  }
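A sketch of a typical setup against the updated ggml_backend_sched_new: the CPU backend still has to come last, only the check now goes through the device type. The GPU backend below is a placeholder, not something defined in this diff.

    ggml_backend_t backends[2];
    backends[0] = some_gpu_backend;          // hypothetical device backend (CUDA, Metal, Vulkan, ...)
    backends[1] = ggml_backend_cpu_init();   // last backend must be of the CPU device type

    // default buffer types, graph capacity 2048 nodes, no pipeline parallelism
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, 2048, false);
    // run with GGML_SCHED_DEBUG=1 (splits) or GGML_SCHED_DEBUG=2 (splits + per-node) for the new logging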
@@ -1932,12 +1539,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *

  ggml_backend_sched_split_graph(sched, measure_graph);

+ ggml_backend_sched_synchronize(sched);
+
  if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
  return false;
  }

  ggml_backend_sched_reset(sched);
- ggml_backend_sched_synchronize(sched);

  return true;
  }
@@ -2115,8 +1723,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_

  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
  struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
- struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
- bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
+ struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));

  struct ggml_init_params params = {
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -2128,13 +1736,13 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  struct ggml_context * ctx_unallocated = ggml_init(params);

  if (ctx_allocated == NULL || ctx_unallocated == NULL) {
- fprintf(stderr, "failed to allocate context for graph copy\n");
+ GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
  ggml_hash_set_free(&hash_set);
  free(node_copies);
  free(node_init);
  ggml_free(ctx_allocated);
  ggml_free(ctx_unallocated);
- return (struct ggml_backend_graph_copy) {
+ return {
  /* .buffer = */ NULL,
  /* .ctx_allocated = */ NULL,
  /* .ctx_unallocated = */ NULL,
@@ -2151,13 +1759,13 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  // allocate nodes
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
  if (buffer == NULL) {
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
+ GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
  ggml_hash_set_free(&hash_set);
  free(node_copies);
  free(node_init);
  ggml_free(ctx_allocated);
  ggml_free(ctx_unallocated);
- return (struct ggml_backend_graph_copy) {
+ return {
  /* .buffer = */ NULL,
  /* .ctx_allocated = */ NULL,
  /* .ctx_unallocated = */ NULL,
@@ -2186,7 +1794,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  free(node_copies);
  free(node_init);

- return (struct ggml_backend_graph_copy) {
+ return {
  /* .buffer = */ buffer,
  /* .ctx_allocated = */ ctx_allocated,
  /* .ctx_unallocated = */ ctx_unallocated,
@@ -2238,3 +1846,154 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t

  return true;
  }
+
+ // CPU backend - buffer
+
+ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+ uintptr_t data = (uintptr_t)buffer->context;
+
+ // align the buffer
+ if (data % TENSOR_ALIGNMENT != 0) {
+ data = GGML_PAD(data, TENSOR_ALIGNMENT);
+ }
+
+ return (void *)data;
+ }
+
+ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_aligned_free(buffer->context, buffer->size);
+ }
+
+ static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ memset((char *)tensor->data + offset, value, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ memcpy((char *)tensor->data + offset, data, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ memcpy(data, (const char *)tensor->data + offset, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+ if (ggml_backend_buffer_is_host(src->buffer)) {
+ memcpy(dst->data, src->data, ggml_nbytes(src));
+ return true;
+ }
+ return false;
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+ }
+
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+ /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ // CPU backend buffer type
+
+ // this buffer type is defined here to make it available to all backends
+
+ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU";
+
+ GGML_UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ void * data = ggml_aligned_malloc(size);
+
+ if (data == NULL) {
+ GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+ return NULL;
+ }
+
+ return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+ }
+
+ static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ return TENSOR_ALIGNMENT;
+
+ GGML_UNUSED(buft);
+ }
+
+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ GGML_UNUSED(buft);
+ }
+
+ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type;
+ }
+
+ static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU_Mapped";
+
+ GGML_UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type;
+ }
+
+ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+ GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+ return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+ }
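A usage sketch for the new ggml_backend_cpu_buffer_from_ptr helper added above. TENSOR_ALIGNMENT is internal to the file, so the 64-byte alignment used here is an assumption; the allocation helper is standard C11 aligned_alloc.

    #include <stdlib.h>

    size_t size = 16u << 20;                 // 16 MiB of caller-owned memory
    void * mem  = aligned_alloc(64, size);   // must satisfy the alignment GGML_ASSERT above

    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(mem, size);
    // ... place tensors into buf through the usual allocation APIs ...
    ggml_backend_buffer_free(buf);           // wrapper only: .free_buffer is NULL, so mem is untouched
    free(mem);                               // the caller still owns and releases the memory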