@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286) hide show
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -8,6 +8,7 @@
8
8
  #include <windows.h>
9
9
  #endif
10
10
 
11
+ #include "ggml-backend.h"
11
12
  #include "ggml-backend-impl.h"
12
13
  #include "ggml-alloc.h"
13
14
  #include "ggml-impl.h"
@@ -34,6 +35,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
34
35
  }
35
36
 
36
37
  ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
38
+ if (size == 0) {
39
+ // return a dummy buffer for zero-sized allocations
40
+ return ggml_backend_buffer_init(buft, {}, NULL, 0);
41
+ }
42
+
37
43
  return buft->iface.alloc_buffer(buft, size);
38
44
  }
39
45
 
@@ -89,7 +95,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
89
95
  }
90
96
 
91
97
  const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
92
- return buffer->iface.get_name(buffer);
98
+ return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
93
99
  }
94
100
 
95
101
  void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -108,6 +114,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
108
114
  }
109
115
 
110
116
  void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
117
+ // get_base is optional if the buffer is zero-sized
118
+ if (buffer->size == 0) {
119
+ return NULL;
120
+ }
121
+
111
122
  void * base = buffer->iface.get_base(buffer);
112
123
 
113
124
  GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -122,6 +133,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
122
133
  }
123
134
  }
124
135
 
136
+ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
137
+ // clear is optional if the buffer is zero-sized
138
+ if (buffer->size == 0) {
139
+ return;
140
+ }
141
+
142
+ buffer->iface.clear(buffer, value);
143
+ }
144
+
125
145
  size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
126
146
  return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
127
147
  }
@@ -134,10 +154,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
134
154
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
135
155
  }
136
156
 
137
- void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
138
- buffer->iface.clear(buffer, value);
139
- }
140
-
141
157
  bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
142
158
  return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
143
159
  }
@@ -198,7 +214,7 @@ void ggml_backend_free(ggml_backend_t backend) {
198
214
  }
199
215
 
200
216
  ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
201
- return backend->iface.get_default_buffer_type(backend);
217
+ return ggml_backend_dev_buffer_type(backend->device);
202
218
  }
203
219
 
204
220
  ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
@@ -236,45 +252,46 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
236
252
  }
237
253
 
238
254
  void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
255
+ GGML_ASSERT(tensor);
239
256
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
240
257
 
258
+ if (size == 0) {
259
+ return;
260
+ }
261
+
241
262
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
242
263
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
243
264
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
244
265
 
245
- if (!size) {
246
- return;
247
- }
248
-
249
266
  buf->iface.set_tensor(buf, tensor, data, offset, size);
250
267
  }
251
268
 
252
269
  void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
270
+ GGML_ASSERT(tensor);
253
271
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
254
272
 
273
+ if (size == 0) {
274
+ return;
275
+ }
276
+
255
277
  GGML_ASSERT(buf != NULL && "tensor buffer not set");
256
278
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
257
279
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
258
280
 
259
- if (!size) {
260
- return;
261
- }
262
-
263
281
  buf->iface.get_tensor(buf, tensor, data, offset, size);
264
282
  }
265
283
 
266
- GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
284
+ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
267
285
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
268
286
 
269
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
270
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
271
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
272
-
273
- if (!size) {
287
+ if (size == 0) {
274
288
  return;
275
289
  }
276
290
 
277
- GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
291
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
292
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
293
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
294
+ GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
278
295
 
279
296
  buf->iface.memset_tensor(buf, tensor, value, offset, size);
280
297
  }
@@ -316,33 +333,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
316
333
  }
317
334
 
318
335
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
319
- // helper to ease transition to device interface
320
- if (backend->device) {
321
- return ggml_backend_dev_supports_op(backend->device, op);
322
- }
323
-
324
- return backend->iface.supports_op(backend, op);
336
+ return ggml_backend_dev_supports_op(backend->device, op);
325
337
  }
326
338
 
327
339
  bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
328
- // helper to ease transition to device interface
329
- if (backend->device) {
330
- return ggml_backend_dev_supports_buft(backend->device, buft);
331
- }
332
-
333
- return backend->iface.supports_buft(backend, buft);
340
+ return ggml_backend_dev_supports_buft(backend->device, buft);
334
341
  }
335
342
 
336
343
  bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
337
- // helper to ease transition to device interface
338
- if (backend->device) {
339
- return ggml_backend_dev_offload_op(backend->device, op);
340
- }
341
-
342
- if (backend->iface.offload_op != NULL) {
343
- return backend->iface.offload_op(backend, op);
344
- }
345
- return false;
344
+ return ggml_backend_dev_offload_op(backend->device, op);
346
345
  }
347
346
 
348
347
  ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
@@ -379,7 +378,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
379
378
  ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
380
379
  } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
381
380
  #ifndef NDEBUG
382
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
381
+ GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
383
382
  #endif
384
383
  size_t nbytes = ggml_nbytes(src);
385
384
  void * data = malloc(nbytes);
@@ -404,837 +403,128 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
404
403
 
405
404
  // an async copy would normally happen after all the queued operations on both backends are completed
406
405
  // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
407
- ggml_backend_synchronize(backend_src);
408
- ggml_backend_synchronize(backend_dst);
409
- ggml_backend_tensor_copy(src, dst);
410
- }
411
-
412
- // events
413
-
414
- ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
415
- // null device is allowed for the transition period to the device interface
416
- if (device == NULL || device->iface.event_new == NULL) {
417
- return NULL;
418
- }
419
- return device->iface.event_new(device);
420
- }
421
-
422
- void ggml_backend_event_free(ggml_backend_event_t event) {
423
- if (event == NULL) {
424
- return;
425
- }
426
- event->device->iface.event_free(event->device, event);
427
- }
428
-
429
- void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
430
- GGML_ASSERT(backend->iface.event_record != NULL);
431
-
432
- backend->iface.event_record(backend, event);
433
- }
434
-
435
- void ggml_backend_event_synchronize(ggml_backend_event_t event) {
436
- GGML_ASSERT(event->device->iface.event_synchronize);
437
-
438
- event->device->iface.event_synchronize(event->device, event);
439
- }
440
-
441
- void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
442
- GGML_ASSERT(backend->iface.event_wait != NULL);
443
-
444
- backend->iface.event_wait(backend, event);
445
- }
446
-
447
- // Backend device
448
-
449
- const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
450
- return device->iface.get_name(device);
451
- }
452
-
453
- const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
454
- return device->iface.get_description(device);
455
- }
456
-
457
- void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
458
- device->iface.get_memory(device, free, total);
459
- }
460
-
461
- enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
462
- return device->iface.get_type(device);
463
- }
464
-
465
- void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
466
- device->iface.get_props(device, props);
467
- }
468
-
469
- ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
470
- return device->reg;
471
- }
472
-
473
- ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
474
- return device->iface.init_backend(device, params);
475
- }
476
-
477
- ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
478
- return device->iface.get_buffer_type(device);
479
- }
480
-
481
- ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
482
- return device->iface.get_host_buffer_type(device);
483
- }
484
-
485
- ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
486
- return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
487
- }
488
-
489
- bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
490
- return device->iface.supports_op(device, op);
491
- }
492
-
493
- bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
494
- return device->iface.supports_buft(device, buft);
495
- }
496
-
497
- bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
498
- return device->iface.offload_op(device, op);
499
- }
500
-
501
- // Backend (reg)
502
-
503
- const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
504
- return reg->iface.get_name(reg);
505
- }
506
-
507
- size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
508
- return reg->iface.get_device_count(reg);
509
- }
510
-
511
- ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
512
- return reg->iface.get_device(reg, index);
513
- }
514
-
515
- void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
516
- if (!reg->iface.get_proc_address) {
517
- return NULL;
518
- }
519
- return reg->iface.get_proc_address(reg, name);
520
- }
521
-
522
- // Backend registry
523
-
524
- #ifdef GGML_USE_CUDA
525
- #include "ggml-cuda.h"
526
- #endif
527
-
528
- struct ggml_backend_registry {
529
- std::vector<ggml_backend_reg_t> backends;
530
- std::vector<ggml_backend_dev_t> devices;
531
-
532
- ggml_backend_registry() {
533
- #ifdef GGML_USE_CUDA
534
- register_backend(ggml_backend_cuda_reg());
535
- #endif
536
-
537
- register_backend(ggml_backend_cpu_reg());
538
-
539
- // TODO: sycl, metal, vulkan, kompute, cann
540
- }
541
-
542
- void register_backend(ggml_backend_reg_t reg) {
543
- #ifndef NDEBUG
544
- fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
545
- __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
546
- #endif
547
- backends.push_back(reg);
548
- for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
549
- register_device(ggml_backend_reg_dev_get(reg, i));
550
- }
551
- }
552
-
553
- void register_device(ggml_backend_dev_t device) {
554
- #ifndef NDEBUG
555
- fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
556
- #endif
557
- devices.push_back(device);
558
- }
559
- };
560
-
561
- static ggml_backend_registry & get_reg() {
562
- static ggml_backend_registry reg;
563
- return reg;
564
- }
565
-
566
- // Internal API
567
- void ggml_backend_register(ggml_backend_reg_t reg) {
568
- get_reg().register_backend(reg);
569
- }
570
-
571
- void ggml_backend_device_register(ggml_backend_dev_t device) {
572
- get_reg().register_device(device);
573
- }
574
-
575
- // Backend (reg) enumeration
576
- size_t ggml_backend_reg_count() {
577
- return get_reg().backends.size();
578
- }
579
-
580
- ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
581
- GGML_ASSERT(index < ggml_backend_reg_count());
582
- return get_reg().backends[index];
583
- }
584
-
585
- ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
586
- for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
587
- ggml_backend_reg_t reg = ggml_backend_reg_get(i);
588
- if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
589
- return reg;
590
- }
591
- }
592
- return NULL;
593
- }
594
-
595
- // Device enumeration
596
- size_t ggml_backend_dev_count() {
597
- return get_reg().devices.size();
598
- }
599
-
600
- ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
601
- GGML_ASSERT(index < ggml_backend_dev_count());
602
- return get_reg().devices[index];
603
- }
604
-
605
- ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
606
- for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
607
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
608
- if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
609
- return dev;
610
- }
611
- }
612
- return NULL;
613
- }
614
-
615
- ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
616
- for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
617
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
618
- if (ggml_backend_dev_type(dev) == type) {
619
- return dev;
620
- }
621
- }
622
- return NULL;
623
- }
624
-
625
- // Convenience functions
626
- ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
627
- ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
628
- if (!dev) {
629
- return NULL;
630
- }
631
- return ggml_backend_dev_init(dev, params);
632
- }
633
-
634
- ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
635
- ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
636
- if (!dev) {
637
- return NULL;
638
- }
639
- return ggml_backend_dev_init(dev, params);
640
- }
641
-
642
- ggml_backend_t ggml_backend_init_best(void) {
643
- ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
644
- if (!dev) {
645
- dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
646
- }
647
- if (!dev) {
648
- return NULL;
649
- }
650
- return ggml_backend_dev_init(dev, NULL);
651
- }
652
-
653
- // backend CPU
654
-
655
- static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
656
-
657
- static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
658
- return "CPU";
659
-
660
- GGML_UNUSED(buffer);
661
- }
662
-
663
- static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
664
- uintptr_t data = (uintptr_t)buffer->context;
665
-
666
- // align the buffer
667
- if (data % TENSOR_ALIGNMENT != 0) {
668
- data = GGML_PAD(data, TENSOR_ALIGNMENT);
669
- }
670
-
671
- return (void *)data;
672
- }
673
-
674
- static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
675
- free(buffer->context);
676
- }
677
-
678
- static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
679
- memset((char *)tensor->data + offset, value, size);
680
-
681
- GGML_UNUSED(buffer);
682
- }
683
-
684
- static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
685
- memcpy((char *)tensor->data + offset, data, size);
686
-
687
- GGML_UNUSED(buffer);
688
- }
689
-
690
- static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
691
- memcpy(data, (const char *)tensor->data + offset, size);
692
-
693
- GGML_UNUSED(buffer);
694
- }
695
-
696
- static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
697
- if (ggml_backend_buffer_is_host(src->buffer)) {
698
- memcpy(dst->data, src->data, ggml_nbytes(src));
699
- return true;
700
- }
701
- return false;
702
-
703
- GGML_UNUSED(buffer);
704
- }
705
-
706
- static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
707
- memset(buffer->context, value, buffer->size);
708
- }
709
-
710
- static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
711
- /* .get_name = */ ggml_backend_cpu_buffer_get_name,
712
- /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
713
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
714
- /* .init_tensor = */ NULL, // no initialization required
715
- /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
716
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
717
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
718
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
719
- /* .clear = */ ggml_backend_cpu_buffer_clear,
720
- /* .reset = */ NULL,
721
- };
722
-
723
- static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
724
- /* .get_name = */ ggml_backend_cpu_buffer_get_name,
725
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
726
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
727
- /* .init_tensor = */ NULL, // no initialization required
728
- /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
729
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
730
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
731
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
732
- /* .clear = */ ggml_backend_cpu_buffer_clear,
733
- /* .reset = */ NULL,
734
- };
735
-
736
- static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
737
- return "CPU";
738
-
739
- GGML_UNUSED(buft);
740
- }
741
-
742
- static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
743
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
744
- void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
745
- if (data == NULL) {
746
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
747
- return NULL;
748
- }
749
-
750
- return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
751
- }
752
-
753
- static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
754
- return TENSOR_ALIGNMENT;
755
-
756
- GGML_UNUSED(buft);
757
- }
758
-
759
- static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
760
- return true;
761
-
762
- GGML_UNUSED(buft);
763
- }
764
-
765
- ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
766
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
767
- /* .iface = */ {
768
- /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
769
- /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
770
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
771
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
772
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
773
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
774
- },
775
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
776
- /* .context = */ NULL,
777
- };
778
-
779
- return &ggml_backend_cpu_buffer_type;
780
- }
781
-
782
- #ifdef GGML_USE_CPU_HBM
783
-
784
- // buffer type HBM
785
-
786
- #include <hbwmalloc.h>
787
-
788
- static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
789
- return "CPU_HBM";
790
-
791
- GGML_UNUSED(buft);
792
- }
793
-
794
- static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
795
- return "CPU_HBM";
796
-
797
- GGML_UNUSED(buf);
798
- }
799
-
800
- static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
801
- hbw_free(buffer->context);
802
- }
803
-
804
- static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
805
- //void * ptr = hbw_malloc(size);
806
- void * ptr;
807
- int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
808
- if (result != 0) {
809
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
810
- return NULL;
811
- }
812
-
813
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
814
- buffer->buft = buft;
815
- buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
816
- buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
817
-
818
- return buffer;
819
- }
820
-
821
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
822
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
823
- /* .iface = */ {
824
- /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
825
- /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
826
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
827
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
828
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
829
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
830
- },
831
- /* .context = */ NULL,
832
- };
833
-
834
- return &ggml_backend_cpu_buffer_type_hbm;
835
- }
836
- #endif
837
-
838
- struct ggml_backend_cpu_context {
839
- int n_threads;
840
- ggml_threadpool_t threadpool;
841
-
842
- uint8_t * work_data;
843
- size_t work_size;
844
-
845
- ggml_abort_callback abort_callback;
846
- void * abort_callback_data;
847
- };
848
-
849
- static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
850
- return "CPU";
851
-
852
- GGML_UNUSED(backend);
853
- }
854
-
855
- static void ggml_backend_cpu_free(ggml_backend_t backend) {
856
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
857
- delete[] cpu_ctx->work_data;
858
- delete cpu_ctx;
859
- delete backend;
860
- }
861
-
862
- static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
863
- return ggml_backend_cpu_buffer_type();
864
-
865
- GGML_UNUSED(backend);
866
- }
867
-
868
- struct ggml_backend_plan_cpu {
869
- struct ggml_cplan cplan;
870
- struct ggml_cgraph cgraph;
871
- };
872
-
873
- static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
874
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
875
-
876
- struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
877
-
878
- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
879
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
880
-
881
- if (cpu_plan->cplan.work_size > 0) {
882
- cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
883
- if (cpu_plan->cplan.work_data == NULL) {
884
- delete cpu_plan;
885
- return NULL;
886
- }
887
- }
888
-
889
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
890
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
891
-
892
- return cpu_plan;
893
- }
894
-
895
- static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
896
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
897
-
898
- delete[] cpu_plan->cplan.work_data;
899
- delete cpu_plan;
900
-
901
- GGML_UNUSED(backend);
902
- }
903
-
904
- static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
905
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
906
-
907
- return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
908
-
909
- GGML_UNUSED(backend);
910
- }
911
-
912
- static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
913
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
914
-
915
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
916
-
917
- if (cpu_ctx->work_size < cplan.work_size) {
918
- delete[] cpu_ctx->work_data;
919
- cpu_ctx->work_data = new uint8_t[cplan.work_size];
920
- if (cpu_ctx->work_data == NULL) {
921
- cpu_ctx->work_size = 0;
922
- return GGML_STATUS_ALLOC_FAILED;
923
- }
924
- cpu_ctx->work_size = cplan.work_size;
925
- }
926
- cplan.work_data = (uint8_t *)cpu_ctx->work_data;
927
-
928
- cplan.abort_callback = cpu_ctx->abort_callback;
929
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
930
-
931
- return ggml_graph_compute(cgraph, &cplan);
932
- }
933
-
934
- static const struct ggml_backend_i ggml_backend_cpu_i = {
935
- /* .get_name = */ ggml_backend_cpu_get_name,
936
- /* .free = */ ggml_backend_cpu_free,
937
- /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
938
- /* .set_tensor_async = */ NULL,
939
- /* .get_tensor_async = */ NULL,
940
- /* .cpy_tensor_async = */ NULL,
941
- /* .synchronize = */ NULL,
942
- /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
943
- /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
944
- /* .graph_plan_update = */ NULL,
945
- /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
946
- /* .graph_compute = */ ggml_backend_cpu_graph_compute,
947
- /* .supports_op = */ NULL,
948
- /* .supports_buft = */ NULL,
949
- /* .offload_op = */ NULL,
950
- /* .event_record = */ NULL,
951
- /* .event_wait = */ NULL,
952
- };
953
-
954
- static ggml_guid_t ggml_backend_cpu_guid(void) {
955
- static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
956
- return &guid;
957
- }
958
-
959
- ggml_backend_t ggml_backend_cpu_init(void) {
960
- struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
961
- if (ctx == NULL) {
962
- return NULL;
963
- }
964
-
965
- ctx->n_threads = GGML_DEFAULT_N_THREADS;
966
- ctx->threadpool = NULL;
967
- ctx->work_data = NULL;
968
- ctx->work_size = 0;
969
- ctx->abort_callback = NULL;
970
- ctx->abort_callback_data = NULL;
971
-
972
- ggml_backend_t cpu_backend = new ggml_backend {
973
- /* .guid = */ ggml_backend_cpu_guid(),
974
- /* .interface = */ ggml_backend_cpu_i,
975
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
976
- /* .context = */ ctx,
977
- };
978
-
979
- if (cpu_backend == NULL) {
980
- delete ctx;
981
- return NULL;
982
- }
983
-
984
- return cpu_backend;
985
- }
986
-
987
- bool ggml_backend_is_cpu(ggml_backend_t backend) {
988
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
989
- }
990
-
991
- void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
992
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
993
-
994
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
995
- ctx->n_threads = n_threads;
996
- }
997
-
998
- void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
999
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
1000
-
1001
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
1002
-
1003
- if (ctx->threadpool && ctx->threadpool != threadpool) {
1004
- // already had a different threadpool, pause/suspend it before switching
1005
- ggml_threadpool_pause(ctx->threadpool);
1006
- }
1007
- ctx->threadpool = threadpool;
1008
- }
1009
-
1010
- void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
1011
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
1012
-
1013
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
1014
- ctx->abort_callback = abort_callback;
1015
- ctx->abort_callback_data = abort_callback_data;
1016
- }
1017
-
1018
- ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1019
- GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
1020
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
406
+ ggml_backend_synchronize(backend_src);
407
+ ggml_backend_synchronize(backend_dst);
408
+ ggml_backend_tensor_copy(src, dst);
1021
409
  }
1022
410
 
1023
- ////////////////////////
411
+ // events
1024
412
 
1025
- struct ggml_backend_cpu_device_context {
1026
- std::string description = "CPU";
413
+ ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
414
+ // null device is allowed for the transition period to the device interface
415
+ if (device == NULL || device->iface.event_new == NULL) {
416
+ return NULL;
417
+ }
418
+ return device->iface.event_new(device);
419
+ }
1027
420
 
1028
- ggml_backend_cpu_device_context() {
1029
- #ifdef __APPLE__
1030
- size_t len = 0;
1031
- if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
1032
- description.resize(len);
1033
- sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
1034
- }
1035
- #elif defined(__linux__)
1036
- FILE * f = fopen("/proc/cpuinfo", "r");
1037
- if (f) {
1038
- char buf[1024];
1039
- while (fgets(buf, sizeof(buf), f)) {
1040
- if (strncmp(buf, "model name", 10) == 0) {
1041
- char * p = strchr(buf, ':');
1042
- if (p) {
1043
- p++;
1044
- while (std::isspace(*p)) {
1045
- p++;
1046
- }
1047
- while (std::isspace(p[strlen(p) - 1])) {
1048
- p[strlen(p) - 1] = '\0';
1049
- }
1050
- description = p;
1051
- break;
1052
- }
1053
- }
1054
- }
1055
- fclose(f);
1056
- }
1057
- #elif defined(_WIN32)
1058
- HKEY hKey;
1059
- if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
1060
- TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
1061
- 0,
1062
- KEY_READ,
1063
- &hKey) == ERROR_SUCCESS) {
1064
- DWORD cpu_brand_size = 0;
1065
- if (RegQueryValueExA(hKey,
1066
- TEXT("ProcessorNameString"),
1067
- NULL,
1068
- NULL,
1069
- NULL,
1070
- &cpu_brand_size) == ERROR_SUCCESS) {
1071
- description.resize(cpu_brand_size);
1072
- if (RegQueryValueExA(hKey,
1073
- TEXT("ProcessorNameString"),
1074
- NULL,
1075
- NULL,
1076
- (LPBYTE)&description[0], // NOLINT
1077
- &cpu_brand_size) == ERROR_SUCCESS) {
1078
- if (description.find('\0') != std::string::npos) {
1079
- description.resize(description.find('\0'));
1080
- }
1081
- }
1082
- }
1083
- RegCloseKey(hKey);
1084
- }
1085
- #endif
421
+ void ggml_backend_event_free(ggml_backend_event_t event) {
422
+ if (event == NULL) {
423
+ return;
1086
424
  }
1087
- };
425
+ event->device->iface.event_free(event->device, event);
426
+ }
1088
427
 
1089
- static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
1090
- return "CPU";
428
+ void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
429
+ GGML_ASSERT(backend->iface.event_record != NULL);
1091
430
 
1092
- GGML_UNUSED(dev);
431
+ backend->iface.event_record(backend, event);
1093
432
  }
1094
433
 
1095
- static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
1096
- struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
434
+ void ggml_backend_event_synchronize(ggml_backend_event_t event) {
435
+ GGML_ASSERT(event->device->iface.event_synchronize);
1097
436
 
1098
- return ctx->description.c_str();
437
+ event->device->iface.event_synchronize(event->device, event);
1099
438
  }
1100
439
 
1101
- static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
1102
- // TODO
1103
- *free = 0;
1104
- *total = 0;
440
+ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
441
+ GGML_ASSERT(backend->iface.event_wait != NULL);
1105
442
 
1106
- GGML_UNUSED(dev);
443
+ backend->iface.event_wait(backend, event);
1107
444
  }
1108
445
 
1109
- static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
1110
- return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
+ // Backend device

- GGML_UNUSED(dev);
+ const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
+ return device->iface.get_name(device);
  }

- static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
- props->name = ggml_backend_cpu_device_get_name(dev);
- props->description = ggml_backend_cpu_device_get_description(dev);
- props->type = ggml_backend_cpu_device_get_type(dev);
- ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
- props->caps = {
- /* async */ false,
- /* host_buffer */ false,
- /* events */ false,
- };
+ const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
+ return device->iface.get_description(device);
  }

- static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
- return ggml_backend_cpu_init();
+ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+ device->iface.get_memory(device, free, total);
+ }

- GGML_UNUSED(dev);
- GGML_UNUSED(params);
+ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
+ return device->iface.get_type(device);
  }

- static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
- return ggml_backend_cpu_buffer_type();
+ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
+ memset(props, 0, sizeof(*props));
+ device->iface.get_props(device, props);
+ }

- GGML_UNUSED(dev);
+ ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
+ return device->reg;
  }

- static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
- return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
+ return device->iface.init_backend(device, params);
+ }

- GGML_UNUSED(dev);
- GGML_UNUSED(max_tensor_size);
+ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
+ return device->iface.get_buffer_type(device);
  }

- static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
- switch (op->op) {
- case GGML_OP_CPY:
- return
- op->type != GGML_TYPE_IQ2_XXS &&
- op->type != GGML_TYPE_IQ2_XS &&
- op->type != GGML_TYPE_IQ1_S &&
- op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
- case GGML_OP_MUL_MAT:
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
- case GGML_OP_ROPE_BACK:
- return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
- case GGML_OP_IM2COL_BACK:
- return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
- case GGML_OP_OUT_PROD:
- return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
- default:
- return true;
+ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
+ if (device->iface.get_host_buffer_type == NULL) {
+ return NULL;
  }

- GGML_UNUSED(dev);
+ return device->iface.get_host_buffer_type(device);
  }

- static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
- return ggml_backend_buft_is_host(buft);
-
- GGML_UNUSED(dev);
+ ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
  }

- static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
- /* .get_name = */ ggml_backend_cpu_device_get_name,
- /* .get_description = */ ggml_backend_cpu_device_get_description,
- /* .get_memory = */ ggml_backend_cpu_device_get_memory,
- /* .get_type = */ ggml_backend_cpu_device_get_type,
- /* .get_props = */ ggml_backend_cpu_device_get_props,
- /* .init_backend = */ ggml_backend_cpu_device_init,
- /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
- /* .get_host_buffer_type = */ NULL,
- /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
- /* .supports_op = */ ggml_backend_cpu_device_supports_op,
- /* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
- /* .offload_op = */ NULL,
- /* .event_new = */ NULL,
- /* .event_free = */ NULL,
- /* .event_synchronize = */ NULL,
- };
-
- ////////////////////////
-
- static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
- return "CPU";
-
- GGML_UNUSED(reg);
+ bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+ return device->iface.supports_op(device, op);
  }

- static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
- return 1;
-
- GGML_UNUSED(reg);
+ bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
+ return device->iface.supports_buft(device, buft);
  }

- static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
- GGML_ASSERT(index == 0);
+ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+ if (device->iface.offload_op != NULL) {
+ return device->iface.offload_op(device, op);
+ }

- static ggml_backend_cpu_device_context ctx;
- static ggml_backend_device ggml_backend_cpu_device = {
- /* .iface = */ ggml_backend_cpu_device_i,
- /* .reg = */ reg,
- /* .context = */ &ctx,
- };
+ return false;
+ }

- return &ggml_backend_cpu_device;
+ // Backend (reg)

- GGML_UNUSED(reg);
- GGML_UNUSED(index);
+ const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
+ return reg->iface.get_name(reg);
  }

- static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
- /* .get_name = */ ggml_backend_cpu_reg_get_name,
- /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
- /* .get_device = */ ggml_backend_cpu_reg_get_device,
- /* .get_proc_address = */ NULL,
- };
+ size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
+ return reg->iface.get_device_count(reg);
+ }

- ggml_backend_reg_t ggml_backend_cpu_reg(void) {
- static struct ggml_backend_reg ggml_backend_cpu_reg = {
- /* .iface = */ ggml_backend_cpu_reg_i,
- /* .context = */ NULL,
- };
+ ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
+ return reg->iface.get_device(reg, index);
+ }

- return &ggml_backend_cpu_reg;
+ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+ if (!reg->iface.get_proc_address) {
+ return NULL;
+ }
+ return reg->iface.get_proc_address(reg, name);
  }

  // multi-buffer buffer
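Note: the ggml_backend_dev_* and ggml_backend_reg_* functions added above are thin wrappers that dispatch through each device's iface table, replacing the CPU-specific helpers that were removed. A minimal sketch of how a caller might enumerate devices through this API, assuming the public ggml-backend.h header and the ggml_backend_cpu_reg() registry referenced elsewhere in this diff (illustrative only, not part of the package):

    #include <stdio.h>
    #include "ggml-backend.h"

    // Illustrative: walk one backend registry and report each device.
    static void list_devices(ggml_backend_reg_t reg) {
        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
            ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
            size_t free_mem = 0, total_mem = 0;
            ggml_backend_dev_memory(dev, &free_mem, &total_mem);
            printf("%s: %s (%zu/%zu bytes free)\n",
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                free_mem, total_mem);
        }
    }
    // usage (illustrative): list_devices(ggml_backend_cpu_reg());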
@@ -1244,12 +534,6 @@ struct ggml_backend_multi_buffer_context {
  size_t n_buffers;
  };

- static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
- ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
-
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
- }
-
  static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
  for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -1268,7 +552,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
  }

  static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
- /* .get_name = */ ggml_backend_multi_buffer_get_name,
  /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
  /* .get_base = */ NULL,
  /* .init_tensor = */ NULL,
@@ -1297,7 +580,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
  }

  bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+ return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
  }

  void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
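Note: with get_name removed from the buffer interface, ggml_backend_buffer_is_multi_buffer() above now recognizes a multi-buffer by comparing the free_buffer callback instead of the name callback. A hypothetical sketch of the same identity-by-callback pattern for some other buffer implementation (my_buffer_free_buffer is an invented name, not part of ggml):

    // Hypothetical: a buffer belongs to "my" implementation iff its iface
    // points at my own callback, mirroring the check used above.
    static bool my_buffer_is_mine(ggml_backend_buffer_t buffer) {
        return buffer->iface.free_buffer == my_buffer_free_buffer;
    }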
@@ -1389,7 +672,7 @@ struct ggml_backend_sched {
  char * context_buffer;
  size_t context_buffer_size;

- bool debug;
+ int debug;
  };

  #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1408,7 +691,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
  }

  static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
- ggml_backend_buffer_t buffer = tensor->buffer;
+ ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
  if (buffer == NULL) {
  return -1;
  }
@@ -1422,7 +705,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
  }

  #ifndef NDEBUG
- fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+ GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
  __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
  #endif

@@ -1441,8 +724,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML

  // returns the backend that should be used for the node based on the current locations
  static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
- // TODO: use supports_op to check if the backend supports the op
-
  // assign pre-allocated nodes to their backend
  int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
  if (cur_backend_id != -1) {
@@ -1461,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

  if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
  // since the tensor is pre-allocated, it cannot be moved to another backend
- GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+ ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+ GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
  }

  // graph input
@@ -1477,7 +759,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  if (src == NULL) {
  continue;
  }
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
+ // not an ideal solution
+ if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
  // check if a backend with higher prio wants to offload the op
  if (src_backend_id == sched->n_backends - 1) {
@@ -1511,32 +795,34 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
  for (int i = 0; i < graph->n_nodes; i++) {
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
  ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+ GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
  sched->splits[cur_split].n_inputs);
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+ GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
  fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
  }
- fprintf(stderr, "\n");
+ GGML_LOG_DEBUG("\n");
  cur_split++;
  }
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
  }
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
+ if (sched->debug > 1) {
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+ GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+ fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
  }
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
- fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+ GGML_LOG_DEBUG("\n");
  }
- fprintf(stderr, "\n");
  }
  }

@@ -1828,11 +1114,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  if (src == NULL) {
  continue;
  }
- // check if a weight is on a different backend
+ // check if a weight is on a different and incompatible backend
  // by starting a new split, the memory of the previously offloaded weights can be reused
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = tensor_backend_id(src);
- if (src_backend_id != cur_backend_id) {
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
  need_new_split = true;
  break;
  }
@@ -1844,7 +1130,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  int src_backend_id = sched->hv_tensor_backend_ids[id];
  bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
  if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
  need_new_split = true;
  break;
  }
@@ -2050,11 +1335,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
  // the re-allocation may cause the split inputs to be moved to a different address
  ggml_backend_sched_synchronize(sched);
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+ GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
  #endif
  ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
  if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
+ GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
  return false;
  }
  }
@@ -2165,11 +1450,12 @@ ggml_backend_sched_t ggml_backend_sched_new(
  bool parallel) {
  GGML_ASSERT(n_backends > 0);
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
- GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+ GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);

  struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));

- sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+ const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+ sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
  sched->n_backends = n_backends;
  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
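Note: sched->debug is now a numeric level parsed from GGML_SCHED_DEBUG rather than a simple presence check; judging by the sched->debug > 1 guard added to ggml_backend_sched_print_assignments() earlier in this diff, 1 keeps the split summaries while values above 1 also print per-node assignments. A small illustrative sketch of the same parsing (not library code):

    #include <stdlib.h>

    // e.g. GGML_SCHED_DEBUG=2 in the environment before creating the scheduler
    static int sched_debug_level(void) {
        const char * env = getenv("GGML_SCHED_DEBUG");
        return env ? atoi(env) : 0; // 0 = off, 1 = split summaries, >1 = per-node detail
    }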
 
@@ -2197,6 +1483,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
  sched->backends[b] = backends[b];
  sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
  GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
  if (sched->n_copies > 1) {
  for (int c = 0; c < sched->n_copies; c++) {
  sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
@@ -2252,12 +1539,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *

  ggml_backend_sched_split_graph(sched, measure_graph);

+ ggml_backend_sched_synchronize(sched);
+
  if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
  return false;
  }

  ggml_backend_sched_reset(sched);
- ggml_backend_sched_synchronize(sched);

  return true;
  }
@@ -2448,7 +1736,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  struct ggml_context * ctx_unallocated = ggml_init(params);

  if (ctx_allocated == NULL || ctx_unallocated == NULL) {
- fprintf(stderr, "failed to allocate context for graph copy\n");
+ GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
  ggml_hash_set_free(&hash_set);
  free(node_copies);
  free(node_init);
@@ -2471,7 +1759,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  // allocate nodes
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
  if (buffer == NULL) {
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
+ GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
  ggml_hash_set_free(&hash_set);
  free(node_copies);
  free(node_init);
@@ -2558,3 +1846,154 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t

  return true;
  }
+
+ // CPU backend - buffer
+
+ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+ uintptr_t data = (uintptr_t)buffer->context;
+
+ // align the buffer
+ if (data % TENSOR_ALIGNMENT != 0) {
+ data = GGML_PAD(data, TENSOR_ALIGNMENT);
+ }
+
+ return (void *)data;
+ }
+
+ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_aligned_free(buffer->context, buffer->size);
+ }
+
+ static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ memset((char *)tensor->data + offset, value, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ memcpy((char *)tensor->data + offset, data, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ memcpy(data, (const char *)tensor->data + offset, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+ if (ggml_backend_buffer_is_host(src->buffer)) {
+ memcpy(dst->data, src->data, ggml_nbytes(src));
+ return true;
+ }
+ return false;
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+ }
+
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+ /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ // CPU backend buffer type
+
+ // this buffer type is defined here to make it available to all backends
+
+ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU";
+
+ GGML_UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ void * data = ggml_aligned_malloc(size);
+
+ if (data == NULL) {
+ GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+ return NULL;
+ }
+
+ return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+ }
+
+ static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ return TENSOR_ALIGNMENT;
+
+ GGML_UNUSED(buft);
+ }
+
+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ GGML_UNUSED(buft);
+ }
+
+ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type;
+ }
+
+ static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU_Mapped";
+
+ GGML_UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type;
+ }
+
+ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+ GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+ return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+ }
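Note: ggml_backend_cpu_buffer_from_ptr() wraps caller-owned memory: it asserts TENSOR_ALIGNMENT alignment and, because free_buffer is NULL in ggml_backend_cpu_buffer_from_ptr_i above, it never frees the pointer. A minimal usage sketch, assuming the public ggml-backend.h header (illustrative only; the 64-byte alignment is an assumption chosen to satisfy TENSOR_ALIGNMENT):

    #include <stdlib.h>
    #include "ggml-backend.h"

    // Wrap an existing, suitably aligned host allocation in a CPU backend buffer.
    static ggml_backend_buffer_t wrap_host_block(size_t size) {
        void * data = aligned_alloc(64, size); // size assumed to be a multiple of 64 for aligned_alloc
        if (data == NULL) {
            return NULL;
        }
        // the buffer does not take ownership of data; the caller frees it after
        // releasing the wrapper with ggml_backend_buffer_free()
        return ggml_backend_cpu_buffer_from_ptr(data, size);
    }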