@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -30,6 +30,7 @@
30
30
  #include <cstring>
31
31
  #include <mutex>
32
32
 
33
+ #include "ggml-impl.h"
33
34
  #include "ggml-backend-impl.h"
34
35
  #include "ggml-cann/aclnn_ops.h"
35
36
  #include "ggml-cann/common.h"
@@ -38,68 +39,7 @@
38
39
 
39
40
  #include "ggml-common.h"
40
41
 
41
- /**
42
- * @brief Default logging callback for GGML.
43
- *
44
- * This function is the default logging callback that logs messages to stderr.
45
- *
46
- * @param level The log level.
47
- * @param msg The log message.
48
- * @param user_data User data passed to the callback.
49
- */
50
- static void ggml_cann_default_log_callback(enum ggml_log_level level,
51
- const char* msg, void* user_data) {
52
- GGML_UNUSED(level);
53
- GGML_UNUSED(user_data);
54
- fprintf(stderr, "%s", msg);
55
- }
56
-
57
- ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback;
58
- void* ggml_cann_log_user_data = NULL;
59
-
60
- GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
61
- void* user_data) {
62
- ggml_cann_log_callback = log_callback;
63
- ggml_cann_log_user_data = user_data;
64
- }
65
-
66
- #define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
67
- #define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
68
- #define GGML_CANN_LOG_ERROR(...) \
69
- ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
70
-
71
- GGML_ATTRIBUTE_FORMAT(2, 3)
72
-
73
- /**
74
- * @brief Log a message using the current logging callback.
75
- *
76
- * This function formats a log message and passes it to the current logging
77
- * callback.
78
- *
79
- * @param level The log level.
80
- * @param format The format string for the log message.
81
- * @param ... The arguments for the format string.
82
- */
83
- static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
84
- if (ggml_cann_log_callback != NULL) {
85
- va_list args;
86
- va_start(args, format);
87
- char buffer[128];
88
- int len = vsnprintf(buffer, 128, format, args);
89
- if (len < 128) {
90
- ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data);
91
- } else {
92
- // vsnprintf adds a null terminator
93
- std::vector<char> buffer2(len + 1);
94
- va_end(args);
95
- va_start(args, format);
96
- vsnprintf(&buffer2[0], buffer2.size(), format, args);
97
- ggml_cann_log_callback(level, buffer2.data(),
98
- ggml_cann_log_user_data);
99
- }
100
- va_end(args);
101
- }
102
- }
42
+ #define GGML_CANN_NAME "CANN"
103
43
 
104
44
  /**
105
45
  * @brief Handles CANN errors by printing an error message and aborting.
@@ -115,10 +55,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
115
55
  int32_t id = -1;
116
56
  aclrtGetDevice(&id);
117
57
 
118
- GGML_CANN_LOG_ERROR("CANN error: %s\n", msg);
119
- GGML_CANN_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
58
+ GGML_LOG_ERROR("CANN error: %s\n", msg);
59
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
120
60
  file, line);
121
- GGML_CANN_LOG_ERROR(" %s\n", stmt);
61
+ GGML_LOG_ERROR(" %s\n", stmt);
122
62
  // abort with GGML_ASSERT to get a stack trace
123
63
  GGML_ABORT("CANN error");
124
64
  }
@@ -164,7 +104,7 @@ static ggml_cann_device_info ggml_cann_init() {
164
104
  aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
165
105
 
166
106
  if (err != ACL_SUCCESS) {
167
- GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n",
107
+ GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
168
108
  __func__, aclGetRecentErrMsg());
169
109
  return info;
170
110
  }
@@ -314,7 +254,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
314
254
  *actual_size = look_ahead_size;
315
255
  pool_size += look_ahead_size;
316
256
  #ifdef DEBUG_CANN_MALLOC
317
- GGML_CANN_LOG_INFO(
257
+ GGML_LOG_INFO(
318
258
  "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
319
259
  "requested %u MB\n",
320
260
  __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
@@ -469,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
469
409
  // add to the pool
470
410
  pool_size += reserve_size;
471
411
 
472
- // GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB (
412
+ // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
473
413
  // reserved %llu MB)\n",
474
414
  // device, (unsigned long long) (pool_size/1024/1024),
475
415
  // (unsigned long long) (reserve_size/1024/1024));
@@ -482,7 +422,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
482
422
  pool_used += size;
483
423
 
484
424
  #ifdef DEBUG_CANN_MALLOC
485
- GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
425
+ GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
486
426
  (unsigned long long)size, (unsigned long long)ptr);
487
427
  #endif
488
428
  return ptr;
@@ -496,7 +436,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
496
436
  */
497
437
  void free(void* ptr, size_t size) override {
498
438
  #ifdef DEBUG_CANN_MALLOC
499
- GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
439
+ GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
500
440
  (unsigned long long)size, (unsigned long long)ptr);
501
441
  #endif
502
442
 
@@ -549,23 +489,6 @@ struct ggml_backend_cann_buffer_context {
549
489
  ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
550
490
  };
551
491
 
552
- /**
553
- * @brief Retrieve the name associated with a CANN buffer.
554
- *
555
- * This function returns the name of a CANN buffer, which is stored in the
556
- * context of the buffer.
557
- *
558
- * @param buffer The CANN buffer whose name is to be retrieved.
559
- * @return A pointer to a C-string containing the name of the buffer.
560
- */
561
-
562
- GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
563
- ggml_backend_buffer_t buffer) {
564
- return "CANN";
565
-
566
- GGML_UNUSED(buffer);
567
- }
568
-
569
492
  /**
570
493
  * @brief Check if a buffer is a CANN buffer.
571
494
  *
@@ -575,9 +498,10 @@ GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
575
498
  * @param buffer The buffer to check.
576
499
  * @return true if the buffer is a CANN buffer, false otherwise.
577
500
  */
578
- GGML_CALL static bool ggml_backend_buffer_is_cann(
501
+ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
502
+ static bool ggml_backend_buffer_is_cann(
579
503
  ggml_backend_buffer_t buffer) {
580
- return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
504
+ return ggml_backend_buft_is_cann(buffer->buft);
581
505
  }
582
506
 
583
507
  /**
@@ -588,7 +512,7 @@ GGML_CALL static bool ggml_backend_buffer_is_cann(
588
512
  *
589
513
  * @param buffer The CANN buffer to free.
590
514
  */
591
- GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
515
+ static void ggml_backend_cann_buffer_free_buffer(
592
516
  ggml_backend_buffer_t buffer) {
593
517
  ggml_backend_cann_buffer_context* ctx =
594
518
  (ggml_backend_cann_buffer_context*)buffer->context;
@@ -604,7 +528,7 @@ GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
604
528
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
605
529
  * @return A pointer to the base of the device memory allocated for the buffer.
606
530
  */
607
- GGML_CALL static void* ggml_backend_cann_buffer_get_base(
531
+ static void* ggml_backend_cann_buffer_get_base(
608
532
  ggml_backend_buffer_t buffer) {
609
533
  ggml_backend_cann_buffer_context* ctx =
610
534
  (ggml_backend_cann_buffer_context*)buffer->context;
@@ -624,10 +548,9 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
624
548
  * @param dst Pointer to the destination buffer where transformed data will be
625
549
  * stored.
626
550
  */
627
- GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
628
- const void* src,
629
- void* dst) {
630
- GGML_ASSERT(tensor->op == GGML_OP_NONE);
551
+ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
552
+ const void* src,
553
+ void* dst) {
631
554
 
632
555
  int64_t n_elems = ggml_nelements(tensor);
633
556
  int64_t groups = n_elems / QK4_0;
@@ -677,9 +600,8 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
677
600
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
678
601
  * will be stored.
679
602
  */
680
- GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
603
+ static void ggml_backend_cann_transform_back_q4_0(
681
604
  const ggml_tensor* tensor, void* src, void* dst) {
682
- GGML_ASSERT(tensor->op == GGML_OP_NONE);
683
605
 
684
606
  int64_t n_elems = ggml_nelements(tensor);
685
607
  int64_t groups = n_elems / QK4_0;
@@ -727,9 +649,9 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
727
649
  * @param dst Pointer to the destination buffer where transformed data will be
728
650
  * stored.
729
651
  */
730
- GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
731
- const void* src,
732
- void* dst) {
652
+ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
653
+ const void* src,
654
+ void* dst) {
733
655
  int64_t n_elems = ggml_nelements(tensor);
734
656
  int64_t groups = n_elems / QK8_0;
735
657
  size_t quant_bytes = n_elems * sizeof(uint8_t);
@@ -761,7 +683,7 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
761
683
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
762
684
  * will be stored.
763
685
  */
764
- GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
686
+ static void ggml_backend_cann_transform_back_q8_0(
765
687
  const ggml_tensor* tensor, const void* src, void* dst) {
766
688
  int64_t n_elems = ggml_nelements(tensor);
767
689
  int64_t groups = n_elems / QK8_0;
@@ -793,8 +715,8 @@ GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
793
715
  * @param dst Pointer to the destination buffer where transformed data will be
794
716
  * stored.
795
717
  */
796
- GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
797
- const void* src, void* dst) {
718
+ static void ggml_backend_cann_transform(ggml_tensor* tensor,
719
+ const void* src, void* dst) {
798
720
  switch (tensor->type) {
799
721
  case GGML_TYPE_Q4_0:
800
722
  ggml_backend_cann_transform_q4_0(tensor, src, dst);
@@ -819,7 +741,7 @@ GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
819
741
  * @param dst Pointer to the destination buffer where transformed tensor data
820
742
  * will be stored.
821
743
  */
822
- GGML_CALL static void ggml_backend_cann_transform_back(
744
+ static void ggml_backend_cann_transform_back(
823
745
  const ggml_tensor* tensor, void* src, void* dst) {
824
746
  switch (tensor->type) {
825
747
  case GGML_TYPE_Q4_0:
@@ -842,7 +764,7 @@ GGML_CALL static void ggml_backend_cann_transform_back(
842
764
  * @param type The tensor type to check.
843
765
  * @return true if transformation is needed, false otherwise.
844
766
  */
845
- GGML_CALL static bool need_transform(ggml_type type) {
767
+ static bool need_transform(ggml_type type) {
846
768
  switch (type) {
847
769
  case GGML_TYPE_Q4_0:
848
770
  case GGML_TYPE_Q8_0:
@@ -861,7 +783,7 @@ GGML_CALL static bool need_transform(ggml_type type) {
861
783
  * @param buffer The CANN buffer from which to initialize the tensor.
862
784
  * @param tensor Pointer to the tensor to be initialized.
863
785
  */
864
- GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
786
+ static void ggml_backend_cann_buffer_init_tensor(
865
787
  ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
866
788
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
867
789
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
@@ -897,12 +819,11 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
897
819
  * @param offset Offset in the source data from where to start copying.
898
820
  * @param size Size of the data to be copied, in bytes.
899
821
  */
900
- GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
901
- ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data,
822
+ static void ggml_backend_cann_buffer_set_tensor(
823
+ ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
902
824
  size_t offset, size_t size) {
903
- // GGML_ASSERT(size == ggml_nbytes(tensor));
904
- ggml_backend_cann_buffer_context* ctx =
905
- (ggml_backend_cann_buffer_context*)buffer->context;
825
+ ggml_backend_cann_buffer_context *ctx =
826
+ (ggml_backend_cann_buffer_context *)buffer->context;
906
827
 
907
828
  ggml_cann_set_device(ctx->device);
908
829
  // TODO: refer to cann(#6017), it use thread's default stream.
@@ -910,22 +831,14 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
910
831
  // Why aclrtSynchronizeDevice?
911
832
 
912
833
  if (!need_transform(tensor->type)) {
913
- ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset,
914
- size, ACL_MEMCPY_HOST_TO_DEVICE));
834
+ ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
835
+ ACL_MEMCPY_HOST_TO_DEVICE));
915
836
  } else {
916
- void* transform_buffer = malloc(size);
917
- ggml_backend_cann_transform(tensor, (const char*)data + offset,
918
- transform_buffer);
919
-
920
- #ifndef NDEBUG
921
- void* check_buffer = malloc(size);
922
- ggml_backend_cann_transform_back(tensor, transform_buffer,
923
- check_buffer);
924
- GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) ==
925
- 0);
926
- free(check_buffer);
927
- #endif
928
- ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size,
837
+ void *transform_buffer = malloc(size);
838
+ ggml_backend_cann_transform(tensor, data, transform_buffer);
839
+
840
+ ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
841
+ transform_buffer, size,
929
842
  ACL_MEMCPY_HOST_TO_DEVICE));
930
843
  free(transform_buffer);
931
844
  }
@@ -944,24 +857,23 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
944
857
  * @param offset Offset in the destination buffer where to start copying.
945
858
  * @param size Size of the data to be copied, in bytes.
946
859
  */
947
- GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
860
+ static void ggml_backend_cann_buffer_get_tensor(
948
861
  ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
949
862
  size_t offset, size_t size) {
950
- GGML_ASSERT(size == ggml_nbytes(tensor));
951
863
  ggml_backend_cann_buffer_context* ctx =
952
864
  (ggml_backend_cann_buffer_context*)buffer->context;
953
865
 
954
866
  ggml_cann_set_device(ctx->device);
955
867
 
956
868
  if (!need_transform(tensor->type)) {
957
- ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size,
869
+ ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
958
870
  ACL_MEMCPY_DEVICE_TO_HOST));
959
871
  } else {
960
872
  void* transform_buffer = malloc(size);
961
- ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size,
873
+ ACL_CHECK(aclrtMemcpy(transform_buffer, size,
874
+ (char*)tensor->data + offset, size,
962
875
  ACL_MEMCPY_DEVICE_TO_HOST));
963
- ggml_backend_cann_transform_back(tensor, transform_buffer,
964
- (char*)data + offset);
876
+ ggml_backend_cann_transform_back(tensor, transform_buffer, data);
965
877
  free(transform_buffer);
966
878
  }
967
879
  }
@@ -979,7 +891,7 @@ GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
979
891
  * @param dst Pointer to the destination tensor where the data will be copied.
980
892
  * @return true if the copy operation succeeded, false otherwise.
981
893
  */
982
- GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
894
+ static bool ggml_backend_cann_buffer_cpy_tensor(
983
895
  ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
984
896
  if (ggml_backend_buffer_is_cann(src->buffer)) {
985
897
  ggml_backend_cann_buffer_context* src_ctx =
@@ -1021,7 +933,7 @@ GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
1021
933
  * @param buffer The CANN buffer to be cleared.
1022
934
  * @param value The value to which each byte in the buffer will be set.
1023
935
  */
1024
- GGML_CALL static void ggml_backend_cann_buffer_clear(
936
+ static void ggml_backend_cann_buffer_clear(
1025
937
  ggml_backend_buffer_t buffer, uint8_t value) {
1026
938
  ggml_backend_cann_buffer_context* ctx =
1027
939
  (ggml_backend_cann_buffer_context*)buffer->context;
@@ -1036,11 +948,11 @@ GGML_CALL static void ggml_backend_cann_buffer_clear(
1036
948
  * This structure defines function pointers to operations that can be performed
1037
949
  * on a CANN buffer within the backend.
1038
950
  */
1039
- static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
1040
- /* .get_name = */ ggml_backend_cann_buffer_get_name,
951
+ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
1041
952
  /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
1042
953
  /* .get_base = */ ggml_backend_cann_buffer_get_base,
1043
954
  /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
955
+ /* .memset_tensor = */ NULL,
1044
956
  /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
1045
957
  /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
1046
958
  /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor,
@@ -1068,11 +980,12 @@ struct ggml_backend_cann_buffer_type_context {
1068
980
  * @param buft Pointer to the buffer type context.
1069
981
  * @return Const pointer to the C-style string containing the name.
1070
982
  */
1071
- GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
983
+ static const char* ggml_backend_cann_buffer_type_name(
1072
984
  ggml_backend_buffer_type_t buft) {
1073
- return "CANN";
985
+ ggml_backend_cann_buffer_type_context* buft_ctx =
986
+ (ggml_backend_cann_buffer_type_context*)buft->context;
1074
987
 
1075
- GGML_UNUSED(buft);
988
+ return buft_ctx->name.c_str();
1076
989
  }
1077
990
 
1078
991
  /**
@@ -1085,7 +998,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
1085
998
  * @param size Size in bytes of the buffer to allocate.
1086
999
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
1087
1000
  */
1088
- GGML_CALL static ggml_backend_buffer_t
1001
+ static ggml_backend_buffer_t
1089
1002
  ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1090
1003
  size_t size) {
1091
1004
  ggml_backend_cann_buffer_type_context* buft_ctx =
@@ -1098,7 +1011,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1098
1011
  void* dev_ptr;
1099
1012
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1100
1013
  if (err != ACL_SUCCESS) {
1101
- GGML_CANN_LOG_ERROR(
1014
+ GGML_LOG_ERROR(
1102
1015
  "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
1103
1016
  __func__, size / 1024.0 / 1024.0, buft_ctx->device,
1104
1017
  aclGetRecentErrMsg());
@@ -1124,7 +1037,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1124
1037
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
1125
1038
  * buffers).
1126
1039
  */
1127
- GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
1040
+ static size_t ggml_backend_cann_buffer_type_get_alignment(
1128
1041
  ggml_backend_buffer_type_t buft) {
1129
1042
  return 128;
1130
1043
 
@@ -1145,7 +1058,7 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
1145
1058
  * @return The total allocation size in bytes required for the tensor in the
1146
1059
  * CANN buffer.
1147
1060
  */
1148
- GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1061
+ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1149
1062
  ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
1150
1063
  size_t size = ggml_nbytes(tensor);
1151
1064
  int64_t ne0 = tensor->ne[0];
@@ -1171,19 +1084,25 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1171
1084
  GGML_UNUSED(buft);
1172
1085
  }
1173
1086
 
1087
+ static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1088
+ return false;
1089
+
1090
+ GGML_UNUSED(buft);
1091
+ }
1092
+
1174
1093
  /**
1175
1094
  * @brief Interface for managing CANN buffer types in the GGML backend.
1176
1095
  *
1177
1096
  * Provides function pointers for allocating, querying properties, and managing
1178
1097
  * memory for CANN buffer types in the GGML backend.
1179
1098
  */
1180
- static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
1099
+ static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
1181
1100
  /* .get_name = */ ggml_backend_cann_buffer_type_name,
1182
1101
  /* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer,
1183
1102
  /* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment,
1184
1103
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1185
1104
  /* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size,
1186
- /* .is_host = */ NULL,
1105
+ /* .is_host = */ ggml_backend_cann_buffer_type_is_host,
1187
1106
  };
1188
1107
 
1189
1108
  /**
@@ -1196,7 +1115,7 @@ static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
1196
1115
  * @return A pointer to the buffer type interface for the specified device, or
1197
1116
  * nullptr if the device index is out of range.
1198
1117
  */
1199
- GGML_CALL ggml_backend_buffer_type_t
1118
+ ggml_backend_buffer_type_t
1200
1119
  ggml_backend_cann_buffer_type(int32_t device) {
1201
1120
  static std::mutex mutex;
1202
1121
  std::lock_guard<std::mutex> lock(mutex);
@@ -1214,6 +1133,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
1214
1133
  for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
1215
1134
  ggml_backend_cann_buffer_types[i] = {
1216
1135
  /* .iface = */ ggml_backend_cann_buffer_type_interface,
1136
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
1217
1137
  /* .context = */
1218
1138
  new ggml_backend_cann_buffer_type_context{
1219
1139
  i, "CANN" + std::to_string(i)},
@@ -1225,6 +1145,116 @@ ggml_backend_cann_buffer_type(int32_t device) {
1225
1145
  return &ggml_backend_cann_buffer_types[device];
1226
1146
  }
1227
1147
 
1148
+ /**
1149
+ * @brief Retrieves the name associated with a CANN host buffer type.
1150
+ *
1151
+ * This function returns the descriptive name associated with the specified
1152
+ * CANN host buffer type context.
1153
+ *
1154
+ * @param buft Pointer to the host buffer type context.
1155
+ * @return Const pointer to the C-style string containing the name.
1156
+ */
1157
+ static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
1158
+ return "CANN_Host";
1159
+
1160
+ GGML_UNUSED(buft);
1161
+ }
1162
+
1163
+ /**
1164
+ * @brief Retrieves the name associated with a CANN host buffer.
1165
+ *
1166
+ * This function returns the descriptive name associated with the specified
1167
+ * CANN host buffer context.
1168
+ *
1169
+ * @param buft Pointer to the host buffer context.
1170
+ * @return Const pointer to the C-style string containing the name.
1171
+ */
1172
+ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
1173
+ return "CANN_Host";
1174
+
1175
+ GGML_UNUSED(buffer);
1176
+ }
1177
+
1178
+ /**
1179
+ * @brief Free resources associated with a CANN host buffer.
1180
+ *
1181
+ * This function frees the resources associated with a CANN host buffer, including
1182
+ * its context.
1183
+ *
1184
+ * @param buffer The CANN host buffer to free.
1185
+ */
1186
+ static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
1187
+ ACL_CHECK(aclrtFreeHost(buffer->context));
1188
+ }
1189
+
1190
+ /**
1191
+ * @brief Allocates a new CANN host buffer of the specified size.
1192
+ *
1193
+ * This function allocates a new CANN host buffer with the given size.
1194
+ * @param size Size in bytes of the host buffer to allocate.
1195
+ * @return Pointer to the allocated host buffer, or nullptr if allocation fails.
1196
+ */
1197
+ static void * ggml_cann_host_malloc(size_t size) {
1198
+ if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
1199
+ return nullptr;
1200
+ }
1201
+
1202
+ void * hostPtr = nullptr;
1203
+ aclError err = aclrtMallocHost((void **) &hostPtr, size);
1204
+ if (err != ACL_SUCCESS) {
1205
+
1206
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1207
+ size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1208
+ return nullptr;
1209
+ }
1210
+ return hostPtr;
1211
+ }
1212
+
1213
+ /**
1214
+ * @brief Allocates a new CANN host buffer of the specified type and size.
1215
+ *
1216
+ * @param buft Pointer to the host buffer type context.
1217
+ * @param size Size in bytes of the host buffer to allocate.
1218
+ * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
1219
+ */
1220
+ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1221
+ void * hostPtr = ggml_cann_host_malloc(size);
1222
+
1223
+ if (hostPtr == nullptr) {
1224
+ // fallback to cpu buffer
1225
+ return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1226
+ }
1227
+
1228
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
1229
+ buffer->buft = buft;
1230
+ buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1231
+
1232
+ return buffer;
1233
+ }
1234
+
1235
+ /**
1236
+ * @brief Interface for managing CANN host buffer types in the GGML backend.
1237
+ *
1238
+ * Provides function pointers for allocating, querying properties, and managing
1239
+ * memory for CANN buffer types in the GGML backend.
1240
+ */
1241
+ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1242
+ static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
1243
+ /* .iface = */ {
1244
+ /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1245
+ /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1246
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1247
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1248
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1249
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1250
+ },
1251
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1252
+ /* .context = */ nullptr,
1253
+ };
1254
+
1255
+ return &ggml_backend_cann_buffer_type_host;
1256
+ }
1257
+
1228
1258
  /**
1229
1259
  * @brief Computes the forward operation for a given tensor using CANN
1230
1260
  * operations.
@@ -1388,7 +1418,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1388
1418
  * @param backend Pointer to the CANN backend structure.
1389
1419
  * @return A pointer to a constant string representing the backend name.
1390
1420
  */
1391
- GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1421
+ static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1392
1422
  ggml_backend_cann_context* cann_ctx =
1393
1423
  (ggml_backend_cann_context*)backend->context;
1394
1424
 
@@ -1403,7 +1433,7 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1403
1433
  *
1404
1434
  * @param backend Pointer to the CANN backend structure to be freed.
1405
1435
  */
1406
- GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
1436
+ static void ggml_backend_cann_free(ggml_backend_t backend) {
1407
1437
  ggml_backend_cann_context* cann_ctx =
1408
1438
  (ggml_backend_cann_context*)backend->context;
1409
1439
  ACL_CHECK(aclrtSynchronizeDevice());
@@ -1418,24 +1448,6 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
1418
1448
  delete backend;
1419
1449
  }
1420
1450
 
1421
- /**
1422
- * @brief Retrieves the default buffer type associated with the CANN backend.
1423
- *
1424
- * This function returns the buffer type specific to the device associated
1425
- * with the CANN backend. It is used to allocate buffers for computations
1426
- * performed by the backend.
1427
- *
1428
- * @param backend Pointer to the CANN backend structure.
1429
- * @return Pointer to the buffer type structure for the CANN backend.
1430
- */
1431
- GGML_CALL static ggml_backend_buffer_type_t
1432
- ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
1433
- ggml_backend_cann_context* cann_ctx =
1434
- (ggml_backend_cann_context*)backend->context;
1435
-
1436
- return ggml_backend_cann_buffer_type(cann_ctx->device);
1437
- }
1438
-
1439
1451
  /**
1440
1452
  * @brief Sets tensor data asynchronously in the CANN backend.
1441
1453
  *
@@ -1449,43 +1461,35 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
1449
1461
  * @param offset Offset in bytes within the host data.
1450
1462
  * @param size Size of the data to copy in bytes.
1451
1463
  */
1452
- GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1453
- ggml_tensor* tensor,
1454
- const void* data,
1455
- size_t offset,
1456
- size_t size) {
1457
- ggml_backend_cann_context* cann_ctx =
1458
- (ggml_backend_cann_context*)backend->context;
1464
+ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1465
+ ggml_tensor *tensor,
1466
+ const void *data,
1467
+ size_t offset,
1468
+ size_t size) {
1469
+ ggml_backend_cann_context *cann_ctx =
1470
+ (ggml_backend_cann_context *)backend->context;
1459
1471
 
1460
1472
  if (!need_transform(tensor->type)) {
1473
+ ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
1474
+ size, ACL_MEMCPY_HOST_TO_DEVICE,
1475
+ cann_ctx->stream()));
1476
+ } else {
1477
+ void *transform_buffer = malloc(size);
1478
+ ggml_backend_cann_transform(tensor, data, transform_buffer);
1479
+
1461
1480
  ACL_CHECK(aclrtMemcpyAsync(
1462
- tensor->data, size, (const char*)data + offset, size,
1481
+ (char *)tensor->data + offset, size, transform_buffer, size,
1463
1482
  ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
1464
- } else {
1465
- void* transform_buffer = malloc(size);
1466
- ggml_backend_cann_transform(tensor, (const char*)data + offset,
1467
- transform_buffer);
1468
-
1469
- #ifndef NDEBUG
1470
- void* check_buffer = malloc(size);
1471
- ggml_backend_cann_transform_back(tensor, transform_buffer,
1472
- check_buffer);
1473
- GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size));
1474
- free(check_buffer);
1475
- #endif
1476
- ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size,
1477
- ACL_MEMCPY_HOST_TO_DEVICE,
1478
- cann_ctx->stream()));
1479
1483
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1480
1484
  free(transform_buffer);
1481
1485
  }
1482
1486
  }
1483
1487
 
1484
- GGML_CALL static void ggml_backend_cann_get_tensor_async(
1485
- ggml_backend_t backend, const ggml_tensor* tensor, void* data,
1488
+ static void ggml_backend_cann_get_tensor_async(
1489
+ ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1486
1490
  size_t offset, size_t size) {
1487
- ggml_backend_cann_context* cann_ctx =
1488
- (ggml_backend_cann_context*)backend->context;
1491
+ ggml_backend_cann_context *cann_ctx =
1492
+ (ggml_backend_cann_context *)backend->context;
1489
1493
  ggml_backend_buffer_t buf =
1490
1494
  tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1491
1495
 
@@ -1493,17 +1497,16 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
1493
1497
  "unsupported buffer type");
1494
1498
 
1495
1499
  if (!need_transform(tensor->type)) {
1496
- ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data,
1500
+ ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
1497
1501
  size, ACL_MEMCPY_DEVICE_TO_HOST,
1498
1502
  cann_ctx->stream()));
1499
1503
  } else {
1500
- void* transform_buffer = malloc(size);
1501
- ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size,
1502
- ACL_MEMCPY_DEVICE_TO_HOST,
1503
- cann_ctx->stream()));
1504
+ void *transform_buffer = malloc(size);
1505
+ ACL_CHECK(aclrtMemcpyAsync(
1506
+ transform_buffer, size, (char *)tensor->data + offset, size,
1507
+ ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
1504
1508
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1505
- ggml_backend_cann_transform_back(tensor, transform_buffer,
1506
- (char*)data + offset);
1509
+ ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1507
1510
  free(transform_buffer);
1508
1511
  }
1509
1512
  }
@@ -1521,7 +1524,7 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
1521
1524
  * @param dst Pointer to the destination tensor to copy data to.
1522
1525
  * @return true if the copy operation succeeds, false otherwise.
1523
1526
  */
1524
- GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
1527
+ static bool ggml_backend_cann_cpy_tensor_async(
1525
1528
  ggml_backend_t backend_src, ggml_backend_t backend_dst,
1526
1529
  const ggml_tensor* src, ggml_tensor* dst) {
1527
1530
  GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
@@ -1589,7 +1592,7 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
1589
1592
  *
1590
1593
  * @param backend Pointer to the CANN backend structure to synchronize.
1591
1594
  */
1592
- GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1595
+ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1593
1596
  ggml_backend_cann_context* cann_ctx =
1594
1597
  (ggml_backend_cann_context*)backend->context;
1595
1598
 
@@ -1610,7 +1613,7 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1610
1613
  * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
1611
1614
  * completes successfully, otherwise an appropriate error status.
1612
1615
  */
1613
- GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
1616
+ static enum ggml_status ggml_backend_cann_graph_compute(
1614
1617
  ggml_backend_t backend, ggml_cgraph* cgraph) {
1615
1618
  ggml_backend_cann_context* cann_ctx =
1616
1619
  (ggml_backend_cann_context*)backend->context;
@@ -1627,7 +1630,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
1627
1630
  bool ok = ggml_cann_compute_forward(*cann_ctx, node);
1628
1631
 
1629
1632
  if (!ok) {
1630
- GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
1633
+ GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
1631
1634
  node->name, ggml_op_name(node->op));
1632
1635
  }
1633
1636
  GGML_ASSERT(ok);
@@ -1648,7 +1651,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
1648
1651
  * @return bool Returns true if the operation is supported by the backend,
1649
1652
  * otherwise false.
1650
1653
  */
1651
- GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1654
+ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1652
1655
  const ggml_tensor* op) {
1653
1656
  switch (op->op) {
1654
1657
  case GGML_OP_UNARY:
@@ -1666,10 +1669,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1666
1669
  }
1667
1670
  case GGML_OP_MUL_MAT: {
1668
1671
  switch (op->src[0]->type) {
1669
- // case GGML_TYPE_Q4_0:
1670
1672
  case GGML_TYPE_F16:
1671
1673
  case GGML_TYPE_F32:
1672
1674
  case GGML_TYPE_Q8_0:
1675
+ // TODO: fix me
1676
+ // Current groupsize should not be greater than k-1 in
1677
+ // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
1678
+ case GGML_TYPE_Q4_0:
1673
1679
  return true;
1674
1680
  default:
1675
1681
  return false;
@@ -1694,6 +1700,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1694
1700
  case GGML_TYPE_F32:
1695
1701
  case GGML_TYPE_F16:
1696
1702
  case GGML_TYPE_Q8_0:
1703
+ case GGML_TYPE_Q4_0:
1697
1704
  return true;
1698
1705
  default:
1699
1706
  return false;
@@ -1735,7 +1742,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1735
1742
  return false;
1736
1743
  }
1737
1744
 
1738
- GGML_UNUSED(backend);
1745
+ GGML_UNUSED(dev);
1739
1746
  }
1740
1747
 
1741
1748
  /**
@@ -1753,31 +1760,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
1753
1760
  return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
1754
1761
  }
1755
1762
 
1756
- /**
1757
- * @brief Checks if the CANN backend supports a specific backend buffer type.
1758
- *
1759
- * This function determines whether the CANN backend supports the given backend
1760
- * buffer type by comparing the device context of the backend and buffer type.
1761
- * It returns true if the devices are same between the backend context and
1762
- * buffer type context.
1763
- *
1764
- * @param backend Pointer to the CANN backend.
1765
- * @param buft Pointer to the backend buffer type to check.
1766
- * @return bool Returns true if the CANN backend supports the buffer type,
1767
- * otherwise false.
1768
- */
1769
- GGML_CALL static bool ggml_backend_cann_supports_buft(
1770
- ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
1771
- if (ggml_backend_buft_is_cann(buft)) {
1772
- ggml_backend_cann_context * cann_ctx =
1773
- (ggml_backend_cann_context *)backend->context;
1774
- ggml_backend_cann_buffer_type_context * buft_ctx =
1775
- (ggml_backend_cann_buffer_type_context *)buft->context;
1776
- return buft_ctx->device == cann_ctx->device;
1777
- }
1778
- return false;
1779
- }
1780
-
1781
1763
  /**
1782
1764
  * @brief Determines if a tensor operation should be offloaded to the CANN
1783
1765
  * backend.
@@ -1792,54 +1774,14 @@ GGML_CALL static bool ggml_backend_cann_supports_buft(
1792
1774
  * @return bool Returns true if the operation should be offloaded, otherwise
1793
1775
  * false.
1794
1776
  */
1795
- GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
1777
+ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
1796
1778
  const ggml_tensor* op) {
1797
1779
  const int min_batch_size = 32;
1798
- GGML_UNUSED(backend);
1780
+ GGML_UNUSED(dev);
1799
1781
 
1800
1782
  return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
1801
1783
  }
1802
1784
 
1803
- /**
1804
- * @brief Creates a new event for the CANN backend.
1805
- *
1806
- * This function initializes a new event for the CANN backend by setting the
1807
- * device and creating an ACL runtime event. The created event is then wrapped
1808
- * in a ggml_backend_event structure and returned.
1809
- *
1810
- * @param backend Pointer to the CANN backend.
1811
- * @return ggml_backend_event_t Returns a pointer to the new event structure.
1812
- */
1813
- static ggml_backend_event_t ggml_backend_cann_event_new(
1814
- ggml_backend_t backend) {
1815
- ggml_backend_cann_context* cann_ctx =
1816
- (ggml_backend_cann_context*)backend->context;
1817
-
1818
- ggml_cann_set_device(cann_ctx->device);
1819
-
1820
- aclrtEvent event;
1821
- ACL_CHECK(aclrtCreateEvent(&event));
1822
-
1823
- return new ggml_backend_event{
1824
- /* .backend = */ backend,
1825
- /* .context = */ event,
1826
- };
1827
- }
1828
-
1829
- /**
1830
- * @brief Frees a CANN backend event.
1831
- *
1832
- * This function destroys the ACL runtime event associated with the given CANN
1833
- * backend event and then deletes the event structure itself.
1834
- *
1835
- * @param event Pointer to the event structure to be freed.
1836
- */
1837
- static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
1838
- ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
1839
-
1840
- delete event;
1841
- }
1842
-
1843
1785
  /**
1844
1786
  * @brief Records an event on the CANN backend stream.
1845
1787
  *
@@ -1848,10 +1790,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
1848
1790
  *
1849
1791
  * @param event Pointer to the event structure to be recorded.
1850
1792
  */
1851
- static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
1793
+ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
1852
1794
  ggml_backend_cann_context* cann_ctx =
1853
- (ggml_backend_cann_context*)event->backend->context;
1854
-
1795
+ (ggml_backend_cann_context*)backend->context;
1855
1796
  ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
1856
1797
  }
1857
1798
 
@@ -1869,8 +1810,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
1869
1810
  ggml_backend_event_t event) {
1870
1811
  ggml_backend_cann_context* cann_ctx =
1871
1812
  (ggml_backend_cann_context*)backend->context;
1872
-
1873
- if (ggml_backend_is_cann(event->backend)) {
1813
+ if (ggml_backend_is_cann(backend)) {
1874
1814
  ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
1875
1815
  (aclrtEvent)event->context));
1876
1816
  } else {
@@ -1878,17 +1818,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
1878
1818
  }
1879
1819
  }
1880
1820
 
1881
- /**
1882
- * @brief Synchronizes the given event on the CANN backend.
1883
- *
1884
- * This function waits for the specified event to complete on the ACL runtime.
1885
- *
1886
- * @param event Pointer to the event structure to be synchronized.
1887
- */
1888
- static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
1889
- ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
1890
- }
1891
-
1892
1821
  /**
1893
1822
  * @brief Structure defining the interface for the CANN backend.
1894
1823
  *
@@ -1896,10 +1825,9 @@ static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
1896
1825
  * supported by the CANN backend, including name retrieval, memory
1897
1826
  * management, tensor operations, synchronization, and event handling.
1898
1827
  */
1899
- static ggml_backend_i ggml_backend_cann_interface = {
1828
+ static const ggml_backend_i ggml_backend_cann_interface = {
1900
1829
  /* .get_name = */ ggml_backend_cann_name,
1901
1830
  /* .free = */ ggml_backend_cann_free,
1902
- /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
1903
1831
  /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
1904
1832
  /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
1905
1833
  /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
@@ -1909,14 +1837,8 @@ static ggml_backend_i ggml_backend_cann_interface = {
1909
1837
  /* .graph_plan_update = */ NULL,
1910
1838
  /* .graph_plan_compute = */ NULL,
1911
1839
  /* .graph_compute = */ ggml_backend_cann_graph_compute,
1912
- /* .supports_op = */ ggml_backend_cann_supports_op,
1913
- /* .supports_buft = */ ggml_backend_cann_supports_buft,
1914
- /* .offload_op = */ ggml_backend_cann_offload_op,
1915
- /* .event_new = */ ggml_backend_cann_event_new,
1916
- /* .event_free = */ ggml_backend_cann_event_free,
1917
1840
  /* .event_record = */ ggml_backend_cann_event_record,
1918
1841
  /* .event_wait = */ ggml_backend_cann_event_wait,
1919
- /* .event_synchronize = */ ggml_backend_cann_event_synchronize,
1920
1842
  };
1921
1843
 
1922
1844
  /**
@@ -1933,91 +1855,274 @@ static ggml_guid_t ggml_backend_cann_guid() {
1933
1855
  return &guid;
1934
1856
  }
1935
1857
 
1936
- GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
1858
+ // backend device
1859
+ struct ggml_backend_cann_device_context {
1860
+ int device;
1861
+ std::string name;
1862
+ std::string description;
1863
+ };
1864
+
1865
+ static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
1866
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1867
+ return ctx->name.c_str();
1868
+ }
1869
+
1870
+ static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
1871
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1872
+ return ctx->description.c_str();
1873
+ }
1874
+
1875
+ static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
1876
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1877
+ ggml_backend_cann_get_device_memory(ctx->device, free, total);
1878
+ }
1879
+
1880
+ static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
1881
+ GGML_UNUSED(dev);
1882
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
1883
+ }
1884
+
1885
+ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
1886
+ props->name = ggml_backend_cann_device_get_name(dev);
1887
+ props->description = ggml_backend_cann_device_get_description(dev);
1888
+ props->type = ggml_backend_cann_device_get_type(dev);
1889
+ ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total);
1890
+
1891
+ bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr;
1892
+
1893
+ props->caps = {
1894
+ /* .async = */ false,
1895
+ /* .host_buffer = */ host_buffer,
1896
+ /* .buffer_from_host_ptr = */ false,
1897
+ /* .events = */ true,
1898
+ };
1899
+ }
1900
+
1901
+ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
1902
+ GGML_UNUSED(params);
1903
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1904
+ return ggml_backend_cann_init(ctx->device);
1905
+ }
1906
+
1907
+ /**
1908
+ * @brief Checks if the CANN backend supports a specific backend buffer type.
1909
+ *
1910
+ * This function determines whether the CANN backend supports the given backend
1911
+ * buffer type by comparing the device context of the backend and buffer type.
1912
+ * It returns true if the devices are same between the backend context and
1913
+ * buffer type context.
1914
+ *
1915
+ * @param backend Pointer to the CANN backend.
1916
+ * @param buft Pointer to the backend buffer type to check.
1917
+ * @return bool Returns true if the CANN backend supports the buffer type,
1918
+ * otherwise false.
1919
+ */
1920
+ static bool ggml_backend_cann_supports_buft(
1921
+ ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
1922
+ if (ggml_backend_buft_is_cann(buft)) {
1923
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
1924
+ ggml_backend_cann_buffer_type_context * buft_ctx =
1925
+ (ggml_backend_cann_buffer_type_context *)buft->context;
1926
+ return buft_ctx->device == dev_ctx->device;
1927
+ }
1928
+ return false;
1929
+ }
1930
+
1931
+ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
1932
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1933
+ return ggml_backend_cann_buffer_type(ctx->device);
1934
+ }
1935
+
1936
+ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) {
1937
+ GGML_UNUSED(dev);
1938
+ return ggml_backend_cann_host_buffer_type();
1939
+ }
1940
+
1941
+ /**
1942
+ * @brief Creates a new event for the CANN backend device.
1943
+ *
1944
+ * This function initializes a new event for the CANN backend by setting the
1945
+ * device and creating an ACL runtime event. The created event is then wrapped
1946
+ * in a ggml_backend_event structure and returned.
1947
+ *
1948
+ * @param backend Pointer to the CANN backend.
1949
+ * @return ggml_backend_event_t Returns a pointer to the new event structure.
1950
+ */
1951
+ static ggml_backend_event_t ggml_backend_cann_device_event_new(
1952
+ ggml_backend_dev_t dev) {
1953
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
1954
+
1955
+ ggml_cann_set_device(dev_ctx->device);
1956
+
1957
+ aclrtEvent event;
1958
+ ACL_CHECK(aclrtCreateEvent(&event));
1959
+
1960
+ return new ggml_backend_event{
1961
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device),
1962
+ /* .context = */ event,
1963
+ };
1964
+ }
1965
+
1966
+ /**
1967
+ * @brief Frees a CANN backend event.
1968
+ *
1969
+ * This function destroys the ACL runtime event associated with the given CANN
1970
+ * backend event and then deletes the event structure itself.
1971
+ *
1972
+ * @param event Pointer to the event structure to be freed.
1973
+ */
1974
+ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
1975
+ ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
1976
+
1977
+ delete event;
1978
+ GGML_UNUSED(dev);
1979
+ }
1980
+
1981
+ /**
1982
+ * @brief Synchronizes the given event on the CANN backend.
1983
+ *
1984
+ * This function waits for the specified event to complete on the ACL runtime.
1985
+ *
1986
+ * @param event Pointer to the event structure to be synchronized.
1987
+ */
1988
+ static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
1989
+ ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
1990
+
1991
+ GGML_UNUSED(dev);
1992
+ }
1993
+
1994
+ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
1995
+ /* .get_name = */ ggml_backend_cann_device_get_name,
1996
+ /* .get_description = */ ggml_backend_cann_device_get_description,
1997
+ /* .get_memory = */ ggml_backend_cann_device_get_memory,
1998
+ /* .get_type = */ ggml_backend_cann_device_get_type,
1999
+ /* .get_props = */ ggml_backend_cann_device_get_props,
2000
+ /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2001
+ /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
2002
+ /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
2003
+ /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2004
+ /* .supports_op = */ ggml_backend_cann_supports_op,
2005
+ /* .supports_buft = */ ggml_backend_cann_supports_buft,
2006
+ /* .offload_op = */ ggml_backend_cann_offload_op,
2007
+ /* .event_new = */ ggml_backend_cann_device_event_new,
2008
+ /* .event_free = */ ggml_backend_cann_device_event_free,
2009
+ /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
2010
+ };
2011
+
2012
+
2013
+ // backend reg
2014
+ struct ggml_backend_cann_reg_context {
2015
+ std::vector<ggml_backend_dev_t> devices;
2016
+ };
2017
+
2018
+ static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
2019
+ GGML_UNUSED(reg);
2020
+ return GGML_CANN_NAME;
2021
+ }
2022
+
2023
+ static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
2024
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2025
+ return ctx->devices.size();
2026
+ }
2027
+
2028
+ static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
2029
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2030
+ GGML_ASSERT(index < ctx->devices.size());
2031
+ return ctx->devices[index];
2032
+ }
2033
+
2034
+ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
2035
+ GGML_UNUSED(reg);
2036
+ GGML_UNUSED(name);
2037
+ // reserved for future use
2038
+ return nullptr;
2039
+ }
2040
+
2041
+ static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2042
+ /* .get_name = */ ggml_backend_cann_reg_get_name,
2043
+ /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
2044
+ /* .get_device_get = */ ggml_backend_cann_reg_get_device,
2045
+ /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
2046
+ };
2047
+
2048
+ // backend registry, called only once for cann backend
2049
+ ggml_backend_reg_t ggml_backend_cann_reg() {
2050
+ static ggml_backend_reg reg;
2051
+ static bool initialized = false;
2052
+
2053
+ {
2054
+ static std::mutex mutex;
2055
+ std::lock_guard<std::mutex> lock(mutex);
2056
+ if (!initialized) {
2057
+ aclInit(nullptr);
2058
+ ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
2059
+
2060
+ for (int i = 0; i < ggml_cann_info().device_count; i++) {
2061
+ ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
2062
+ dev_ctx->description = aclrtGetSocName();
2063
+ dev_ctx->device = i;
2064
+ dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2065
+ ggml_cann_set_device(i);
2066
+ ggml_backend_dev_t dev = new ggml_backend_device {
2067
+ /* .interface = */ ggml_backend_cann_device_interface,
2068
+ /* .reg = */ &reg,
2069
+ /* .context = */ dev_ctx
2070
+ };
2071
+ ctx->devices.push_back(dev);
2072
+ }
2073
+
2074
+ reg = ggml_backend_reg {
2075
+ /* .interface = */ ggml_backend_cann_reg_interface,
2076
+ /* .context = */ ctx
2077
+ };
2078
+ }
2079
+
2080
+ initialized = true;
2081
+ }
2082
+
2083
+ return &reg;
2084
+ }
2085
+
2086
+ ggml_backend_t ggml_backend_cann_init(int32_t device) {
1937
2087
  aclInit(nullptr);
1938
2088
  if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
1939
- GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
2089
+ GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
1940
2090
  return nullptr;
1941
2091
  }
1942
2092
 
1943
2093
  ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
1944
2094
  if (ctx == nullptr) {
1945
- GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
2095
+ GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
1946
2096
  return nullptr;
1947
2097
  }
1948
-
2098
+ ggml_cann_set_device(ctx->device);
1949
2099
  ggml_backend_t cann_backend =
1950
2100
  new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
1951
2101
  /* .interface = */ ggml_backend_cann_interface,
2102
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
1952
2103
  /* .context = */ ctx};
1953
2104
 
1954
2105
  return cann_backend;
1955
2106
  }
1956
2107
 
1957
- GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) {
2108
+ bool ggml_backend_is_cann(ggml_backend_t backend) {
1958
2109
  return backend != NULL &&
1959
2110
  ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
1960
2111
  }
1961
2112
 
1962
- GGML_CALL int32_t ggml_backend_cann_get_device_count() {
2113
+ int32_t ggml_backend_cann_get_device_count() {
1963
2114
  return ggml_cann_info().device_count;
1964
2115
  }
1965
2116
 
1966
- GGML_CALL void ggml_backend_cann_get_device_description(
2117
+ void ggml_backend_cann_get_device_description(
1967
2118
  int32_t device, char* description, size_t description_size) {
1968
2119
  ggml_cann_set_device(device);
1969
2120
  const char* soc_name = aclrtGetSocName();
1970
2121
  snprintf(description, description_size, "%s", soc_name);
1971
2122
  }
1972
2123
 
1973
- GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
1974
- size_t* total) {
2124
+ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2125
+ size_t* total) {
1975
2126
  ggml_cann_set_device(device);
1976
2127
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
1977
2128
  }
1978
-
1979
- // backend registry
1980
- /**
1981
- * @brief Initializes a CANN backend based on the provided parameters.
1982
- *
1983
- * This function initializes a CANN backend using the device index and then
1984
- * initializes the backend using `ggml_backend_cann_init`.
1985
- *
1986
- * @param params Parameters for initialization (unused in this implementation).
1987
- * @param user_data User data containing the device index to initialize the
1988
- * backend.
1989
- * @return ggml_backend_t The initialized CANN backend.
1990
- */
1991
- GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params,
1992
- void* user_data) {
1993
- ggml_backend_t cann_backend =
1994
- ggml_backend_cann_init((int)(intptr_t)user_data);
1995
- return cann_backend;
1996
-
1997
- GGML_UNUSED(params);
1998
- }
1999
-
2000
- extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
2001
-
2002
- /**
2003
- * @brief Registers CANN (Ascend) devices as backend options.
2004
- *
2005
- * This function initializes ACL, retrieves the number of available CANN
2006
- * devices, and registers each device as a backend option using
2007
- * `ggml_backend_register`. Each device is given a unique name based on
2008
- * `GGML_CANN_NAME` followed by its index.
2009
- *
2010
- * @return int The number of CANN devices registered.
2011
- */
2012
- GGML_CALL int ggml_backend_cann_reg_devices() {
2013
- uint32_t device_count = ggml_backend_cann_get_device_count();
2014
- // initialization
2015
- for (uint32_t i = 0; i < device_count; i++) {
2016
- char name[128];
2017
- snprintf(name, sizeof(name), "CANN%d", i);
2018
- ggml_backend_register(name, ggml_backend_reg_cann_init,
2019
- ggml_backend_cann_buffer_type(i),
2020
- (void*)(intptr_t)i);
2021
- }
2022
- return device_count;
2023
- }