@fugood/llama.node 0.6.3 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. package/CMakeLists.txt +40 -30
  2. package/README.md +4 -1
  3. package/lib/binding.js +41 -29
  4. package/lib/binding.ts +26 -25
  5. package/package.json +45 -7
  6. package/scripts/build.js +47 -0
  7. package/scripts/llama.cpp.patch +109 -0
  8. package/src/anyascii.c +22223 -0
  9. package/src/anyascii.h +42 -0
  10. package/src/tts_utils.cpp +20 -7
  11. package/src/tts_utils.h +2 -0
  12. package/bin/darwin/arm64/llama-node.node +0 -0
  13. package/bin/darwin/x64/llama-node.node +0 -0
  14. package/bin/linux/arm64/llama-node.node +0 -0
  15. package/bin/linux/x64/llama-node.node +0 -0
  16. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  17. package/bin/linux-cuda/x64/llama-node.node +0 -0
  18. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  19. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  20. package/bin/win32/x64/llama-node.node +0 -0
  21. package/bin/win32/x64/node.lib +0 -0
  22. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  23. package/bin/win32-vulkan/arm64/node.lib +0 -0
  24. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  25. package/bin/win32-vulkan/x64/node.lib +0 -0
  26. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +0 -233
  27. package/src/llama.cpp/.github/workflows/build.yml +0 -1078
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +0 -28
  29. package/src/llama.cpp/.github/workflows/docker.yml +0 -178
  30. package/src/llama.cpp/.github/workflows/editorconfig.yml +0 -29
  31. package/src/llama.cpp/.github/workflows/gguf-publish.yml +0 -44
  32. package/src/llama.cpp/.github/workflows/labeler.yml +0 -17
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +0 -33
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +0 -30
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +0 -40
  36. package/src/llama.cpp/.github/workflows/release.yml +0 -739
  37. package/src/llama.cpp/.github/workflows/server.yml +0 -237
  38. package/src/llama.cpp/.github/workflows/winget.yml +0 -42
  39. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +0 -16
  40. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +0 -16
  41. package/src/llama.cpp/cmake/build-info.cmake +0 -64
  42. package/src/llama.cpp/cmake/common.cmake +0 -35
  43. package/src/llama.cpp/cmake/git-vars.cmake +0 -22
  44. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -5
  45. package/src/llama.cpp/common/build-info.cpp.in +0 -4
  46. package/src/llama.cpp/docs/build.md +0 -561
  47. package/src/llama.cpp/examples/CMakeLists.txt +0 -43
  48. package/src/llama.cpp/examples/batched/CMakeLists.txt +0 -5
  49. package/src/llama.cpp/examples/batched/batched.cpp +0 -246
  50. package/src/llama.cpp/examples/chat-13B.bat +0 -57
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +0 -5
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +0 -941
  53. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +0 -35
  54. package/src/llama.cpp/examples/embedding/CMakeLists.txt +0 -5
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +0 -323
  56. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +0 -10
  57. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +0 -194
  58. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +0 -5
  59. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +0 -83
  60. package/src/llama.cpp/examples/gguf/CMakeLists.txt +0 -5
  61. package/src/llama.cpp/examples/gguf/gguf.cpp +0 -265
  62. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +0 -22
  63. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +0 -46
  64. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +0 -295
  65. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +0 -52
  66. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +0 -221
  67. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +0 -24
  68. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +0 -42
  69. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +0 -7093
  70. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +0 -694
  71. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +0 -5
  72. package/src/llama.cpp/examples/gritlm/gritlm.cpp +0 -229
  73. package/src/llama.cpp/examples/jeopardy/questions.txt +0 -100
  74. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +0 -65
  75. package/src/llama.cpp/examples/llama.android/build.gradle.kts +0 -6
  76. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +0 -71
  77. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +0 -53
  78. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +0 -452
  79. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +0 -18
  80. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +0 -5
  81. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -472
  82. package/src/llama.cpp/examples/lookup/CMakeLists.txt +0 -23
  83. package/src/llama.cpp/examples/lookup/lookup-create.cpp +0 -40
  84. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +0 -47
  85. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -157
  86. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -242
  87. package/src/llama.cpp/examples/parallel/CMakeLists.txt +0 -5
  88. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -492
  89. package/src/llama.cpp/examples/passkey/CMakeLists.txt +0 -5
  90. package/src/llama.cpp/examples/passkey/passkey.cpp +0 -277
  91. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +0 -5
  92. package/src/llama.cpp/examples/retrieval/retrieval.cpp +0 -304
  93. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -5
  94. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -246
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +0 -5
  96. package/src/llama.cpp/examples/simple/simple.cpp +0 -206
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +0 -5
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +0 -206
  99. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +0 -11
  100. package/src/llama.cpp/examples/speculative/CMakeLists.txt +0 -5
  101. package/src/llama.cpp/examples/speculative/speculative.cpp +0 -644
  102. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +0 -5
  103. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +0 -261
  104. package/src/llama.cpp/examples/sycl/CMakeLists.txt +0 -9
  105. package/src/llama.cpp/examples/sycl/build.sh +0 -23
  106. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +0 -13
  107. package/src/llama.cpp/examples/sycl/run-llama2.sh +0 -27
  108. package/src/llama.cpp/examples/sycl/run-llama3.sh +0 -28
  109. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +0 -33
  110. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +0 -9
  111. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +0 -9
  112. package/src/llama.cpp/examples/training/CMakeLists.txt +0 -5
  113. package/src/llama.cpp/examples/training/finetune.cpp +0 -96
  114. package/src/llama.cpp/ggml/cmake/GitVars.cmake +0 -22
  115. package/src/llama.cpp/ggml/cmake/common.cmake +0 -26
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1042
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -255
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -586
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +0 -2008
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +0 -87
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +0 -517
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -74
  123. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +0 -179
  124. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +0 -258
  125. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +0 -2863
  126. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +0 -1110
  127. package/src/llama.cpp/ggml/src/ggml-cann/common.h +0 -420
  128. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -2570
  129. package/src/llama.cpp/ggml/src/ggml-common.h +0 -1857
  130. package/src/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +0 -100
  131. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +0 -184
  132. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +0 -15
  133. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +0 -243
  134. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +0 -140
  135. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -131
  136. package/src/llama.cpp/ggml/src/ggml-impl.h +0 -601
  137. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  138. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  139. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +0 -120
  140. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +0 -622
  141. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -113
  142. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +0 -96
  143. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -5124
  144. package/src/llama.cpp/ggml/src/ggml-opt.cpp +0 -1037
  145. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -5232
  146. package/src/llama.cpp/ggml/src/ggml-quants.h +0 -100
  147. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +0 -9
  148. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +0 -1813
  149. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +0 -189
  150. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +0 -37
  151. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +0 -239
  152. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +0 -39
  153. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -83
  154. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +0 -493
  155. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +0 -197
  156. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +0 -20
  157. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +0 -100
  158. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +0 -20
  159. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +0 -623
  160. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +0 -34
  161. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -701
  162. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +0 -11
  163. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +0 -791
  164. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +0 -1160
  165. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +0 -27
  166. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +0 -2957
  167. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -1536
  168. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +0 -75
  169. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +0 -99
  170. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +0 -311
  171. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +0 -20
  172. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -4443
  173. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +0 -105
  174. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +0 -8
  175. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +0 -136
  176. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +0 -21
  177. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -3030
  178. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +0 -33
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +0 -1108
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +0 -27
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +0 -474
  182. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +0 -26
  183. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +0 -46
  184. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +0 -10
  185. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +0 -74
  186. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +0 -83
  187. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +0 -362
  188. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +0 -20
  189. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +0 -264
  190. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +0 -20
  191. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +0 -13
  192. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +0 -23
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +0 -73
  194. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +0 -20
  195. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +0 -1215
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +0 -305
  197. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +0 -10
  198. package/src/llama.cpp/ggml/src/ggml-threading.cpp +0 -12
  199. package/src/llama.cpp/ggml/src/ggml-threading.h +0 -14
  200. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +0 -196
  201. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +0 -10699
  202. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -39
  203. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +0 -751
  204. package/src/llama.cpp/ggml/src/ggml.c +0 -6550
  205. package/src/llama.cpp/ggml/src/gguf.cpp +0 -1330
  206. package/src/llama.cpp/models/.editorconfig +0 -1
  207. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  208. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  209. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  210. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  211. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  212. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  213. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  214. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  215. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  216. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  217. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  219. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  220. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  221. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  222. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  223. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  225. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  227. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  228. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  230. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  231. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  232. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  233. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  236. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  237. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  239. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  240. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  241. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  242. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  245. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  248. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  249. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  256. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  257. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  259. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  260. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  261. package/src/llama.cpp/pocs/CMakeLists.txt +0 -14
  262. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +0 -9
  263. package/src/llama.cpp/pocs/vdot/q8dot.cpp +0 -173
  264. package/src/llama.cpp/pocs/vdot/vdot.cpp +0 -311
  265. package/src/llama.cpp/prompts/LLM-questions.txt +0 -49
  266. package/src/llama.cpp/prompts/alpaca.txt +0 -1
  267. package/src/llama.cpp/prompts/assistant.txt +0 -31
  268. package/src/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  269. package/src/llama.cpp/prompts/chat-with-bob.txt +0 -7
  270. package/src/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  271. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  272. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  273. package/src/llama.cpp/prompts/chat.txt +0 -28
  274. package/src/llama.cpp/prompts/dan-modified.txt +0 -1
  275. package/src/llama.cpp/prompts/dan.txt +0 -1
  276. package/src/llama.cpp/prompts/mnemonics.txt +0 -93
  277. package/src/llama.cpp/prompts/parallel-questions.txt +0 -43
  278. package/src/llama.cpp/prompts/reason-act.txt +0 -18
  279. package/src/llama.cpp/requirements/requirements-all.txt +0 -15
  280. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +0 -2
  281. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +0 -7
  282. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -7
  283. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +0 -5
  284. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +0 -1
  285. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +0 -4
  286. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +0 -3
  287. package/src/llama.cpp/requirements/requirements-pydantic.txt +0 -3
  288. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +0 -1
  289. package/src/llama.cpp/requirements/requirements-tool_bench.txt +0 -12
  290. package/src/llama.cpp/requirements.txt +0 -13
  291. package/src/llama.cpp/scripts/build-info.sh +0 -30
  292. package/src/llama.cpp/scripts/install-oneapi.bat +0 -19
  293. package/src/llama.cpp/scripts/xxd.cmake +0 -16
  294. package/src/llama.cpp/tests/CMakeLists.txt +0 -177
  295. package/src/llama.cpp/tests/get-model.cpp +0 -21
  296. package/src/llama.cpp/tests/get-model.h +0 -2
  297. package/src/llama.cpp/tests/test-arg-parser.cpp +0 -178
  298. package/src/llama.cpp/tests/test-autorelease.cpp +0 -24
  299. package/src/llama.cpp/tests/test-backend-ops.cpp +0 -4793
  300. package/src/llama.cpp/tests/test-barrier.cpp +0 -94
  301. package/src/llama.cpp/tests/test-c.c +0 -7
  302. package/src/llama.cpp/tests/test-chat-template.cpp +0 -417
  303. package/src/llama.cpp/tests/test-chat.cpp +0 -985
  304. package/src/llama.cpp/tests/test-double-float.cpp +0 -57
  305. package/src/llama.cpp/tests/test-gbnf-validator.cpp +0 -109
  306. package/src/llama.cpp/tests/test-gguf.cpp +0 -1338
  307. package/src/llama.cpp/tests/test-grammar-integration.cpp +0 -1308
  308. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +0 -1201
  309. package/src/llama.cpp/tests/test-grammar-parser.cpp +0 -519
  310. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +0 -1304
  311. package/src/llama.cpp/tests/test-llama-grammar.cpp +0 -408
  312. package/src/llama.cpp/tests/test-log.cpp +0 -39
  313. package/src/llama.cpp/tests/test-model-load-cancel.cpp +0 -27
  314. package/src/llama.cpp/tests/test-mtmd-c-api.c +0 -63
  315. package/src/llama.cpp/tests/test-opt.cpp +0 -904
  316. package/src/llama.cpp/tests/test-quantize-fns.cpp +0 -186
  317. package/src/llama.cpp/tests/test-quantize-perf.cpp +0 -365
  318. package/src/llama.cpp/tests/test-quantize-stats.cpp +0 -424
  319. package/src/llama.cpp/tests/test-regex-partial.cpp +0 -288
  320. package/src/llama.cpp/tests/test-rope.cpp +0 -262
  321. package/src/llama.cpp/tests/test-sampling.cpp +0 -399
  322. package/src/llama.cpp/tests/test-tokenizer-0.cpp +0 -312
  323. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +0 -155
  324. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +0 -125
  325. package/src/llama.cpp/tools/CMakeLists.txt +0 -39
  326. package/src/llama.cpp/tools/batched-bench/CMakeLists.txt +0 -5
  327. package/src/llama.cpp/tools/batched-bench/batched-bench.cpp +0 -204
  328. package/src/llama.cpp/tools/cvector-generator/CMakeLists.txt +0 -5
  329. package/src/llama.cpp/tools/cvector-generator/completions.txt +0 -582
  330. package/src/llama.cpp/tools/cvector-generator/cvector-generator.cpp +0 -508
  331. package/src/llama.cpp/tools/cvector-generator/mean.hpp +0 -48
  332. package/src/llama.cpp/tools/cvector-generator/negative.txt +0 -4
  333. package/src/llama.cpp/tools/cvector-generator/pca.hpp +0 -315
  334. package/src/llama.cpp/tools/cvector-generator/positive.txt +0 -4
  335. package/src/llama.cpp/tools/export-lora/CMakeLists.txt +0 -5
  336. package/src/llama.cpp/tools/export-lora/export-lora.cpp +0 -434
  337. package/src/llama.cpp/tools/gguf-split/CMakeLists.txt +0 -5
  338. package/src/llama.cpp/tools/gguf-split/gguf-split.cpp +0 -583
  339. package/src/llama.cpp/tools/imatrix/CMakeLists.txt +0 -5
  340. package/src/llama.cpp/tools/imatrix/imatrix.cpp +0 -667
  341. package/src/llama.cpp/tools/llama-bench/CMakeLists.txt +0 -5
  342. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +0 -2024
  343. package/src/llama.cpp/tools/main/CMakeLists.txt +0 -5
  344. package/src/llama.cpp/tools/main/main.cpp +0 -977
  345. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +0 -58
  346. package/src/llama.cpp/tools/mtmd/clip-impl.h +0 -462
  347. package/src/llama.cpp/tools/mtmd/clip.cpp +0 -4024
  348. package/src/llama.cpp/tools/mtmd/clip.h +0 -101
  349. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +0 -22
  350. package/src/llama.cpp/tools/mtmd/miniaudio.h +0 -93468
  351. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +0 -855
  352. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +0 -62
  353. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +0 -377
  354. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +0 -297
  355. package/src/llama.cpp/tools/mtmd/mtmd.cpp +0 -942
  356. package/src/llama.cpp/tools/mtmd/mtmd.h +0 -362
  357. package/src/llama.cpp/tools/mtmd/requirements.txt +0 -5
  358. package/src/llama.cpp/tools/perplexity/CMakeLists.txt +0 -5
  359. package/src/llama.cpp/tools/perplexity/perplexity.cpp +0 -2063
  360. package/src/llama.cpp/tools/quantize/CMakeLists.txt +0 -6
  361. package/src/llama.cpp/tools/quantize/quantize.cpp +0 -519
  362. package/src/llama.cpp/tools/rpc/CMakeLists.txt +0 -4
  363. package/src/llama.cpp/tools/rpc/rpc-server.cpp +0 -322
  364. package/src/llama.cpp/tools/run/CMakeLists.txt +0 -16
  365. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.cpp +0 -1995
  366. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.h +0 -137
  367. package/src/llama.cpp/tools/run/run.cpp +0 -1261
  368. package/src/llama.cpp/tools/server/CMakeLists.txt +0 -51
  369. package/src/llama.cpp/tools/server/bench/requirements.txt +0 -2
  370. package/src/llama.cpp/tools/server/httplib.h +0 -10506
  371. package/src/llama.cpp/tools/server/server.cpp +0 -4966
  372. package/src/llama.cpp/tools/server/tests/requirements.txt +0 -8
  373. package/src/llama.cpp/tools/server/utils.hpp +0 -1337
  374. package/src/llama.cpp/tools/tokenize/CMakeLists.txt +0 -5
  375. package/src/llama.cpp/tools/tokenize/tokenize.cpp +0 -416
  376. package/src/llama.cpp/tools/tts/CMakeLists.txt +0 -5
  377. package/src/llama.cpp/tools/tts/tts.cpp +0 -1092
@@ -1,2570 +0,0 @@
1
- /*
2
- * Copyright (c) 2023-2024 The ggml authors
3
- *
4
- * Permission is hereby granted, free of charge, to any person obtaining a copy
5
- * of this software and associated documentation files (the "Software"), to
6
- * deal in the Software without restriction, including without limitation the
7
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8
- * sell copies of the Software, and to permit persons to whom the Software is
9
- * furnished to do so, subject to the following conditions:
10
- *
11
- * The above copyright notice and this permission notice shall be included in
12
- * all copies or substantial portions of the Software.
13
- *
14
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20
- * IN THE SOFTWARE.
21
- */
22
-
23
- #include "ggml-cann.h"
24
-
25
- #include <acl/acl.h>
26
- #include <stdarg.h>
27
-
28
- #include <cmath>
29
- #include <cstdio>
30
- #include <cstring>
31
- #include <mutex>
32
- #include <queue>
33
- #include <chrono>
34
-
35
- #include "ggml-impl.h"
36
- #include "ggml-backend-impl.h"
37
- #include "ggml-cann/aclnn_ops.h"
38
- #include "ggml-cann/common.h"
39
-
40
- #define GGML_COMMON_DECL_C
41
-
42
- #include "ggml-common.h"
43
-
44
- #define GGML_CANN_NAME "CANN"
45
-
46
- /**
47
- * @brief Handles CANN errors by printing an error message and aborting.
48
- *
49
- * @param stmt The statement that caused the error.
50
- * @param func The function in which the error occurred.
51
- * @param file The file in which the error occurred.
52
- * @param line The line number where the error occurred.
53
- * @param msg The error message.
54
- */
55
- [[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
56
- const char* file, int line, const char* msg) {
57
- int32_t id = -1;
58
- aclrtGetDevice(&id);
59
-
60
- GGML_LOG_ERROR("CANN error: %s\n", msg);
61
- GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
62
- file, line);
63
- GGML_LOG_ERROR(" %s\n", stmt);
64
- // abort with GGML_ASSERT to get a stack trace
65
- GGML_ABORT("CANN error");
66
- }
67
-
68
- /**
69
- * @brief Sets the device to be used by CANN.
70
- *
71
- * @param device The device ID to set.
72
- */
73
- void ggml_cann_set_device(const int32_t device) {
74
- // TODO: uncomment these lines after empty context has fixed.
75
- // int current_device;
76
- // ACL_CHECK(aclrtGetDevice(&current_device));
77
-
78
- // if (device == current_device) {
79
- // return;
80
- // }
81
- ACL_CHECK(aclrtSetDevice(device));
82
- }
83
-
84
- /**
85
- * @brief Retrieves the current device ID.
86
- *
87
- * @return The current device ID.
88
- */
89
- int32_t ggml_cann_get_device() {
90
- int32_t id;
91
- ACL_CHECK(aclrtGetDevice(&id));
92
- return id;
93
- }
94
-
95
- /**
96
- * @brief Initialize the CANN device information.
97
- *
98
- * This function initializes the CANN device information by obtaining the
99
- * device count and setting the memory allocation granularity for each device.
100
- *
101
- * @return A structure containing the device information.
102
- */
103
- static ggml_cann_device_info ggml_cann_init() {
104
- ggml_cann_device_info info = {};
105
-
106
- aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
107
-
108
- if (err != ACL_SUCCESS) {
109
- GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
110
- __func__, aclGetRecentErrMsg());
111
- return info;
112
- }
113
-
114
- GGML_ASSERT(info.device_count <= GGML_CANN_MAX_DEVICES);
115
-
116
- for (int id = 0; id < info.device_count; ++id) {
117
- aclrtPhysicalMemProp prop = {};
118
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
119
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
120
- prop.memAttr = ACL_HBM_MEM_HUGE;
121
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
122
- prop.location.id = id;
123
- prop.reserve = 0;
124
- err = aclrtMemGetAllocationGranularity(
125
- &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
126
- &info.devices[id].vmm_granularity);
127
- info.devices[id].vmm = err == ACL_SUCCESS;
128
-
129
- size_t free, total;
130
- ggml_backend_cann_get_device_memory(id, &free, &total);
131
- info.devices[id].total_vram = free;
132
- }
133
-
134
- // TODO: add more device info later.
135
- return info;
136
- }
137
-
138
- /**
139
- * @brief Retrieve the CANN device information.
140
- *
141
- * This function returns a reference to a structure containing the CANN device
142
- * information. The device information is initialized once and reused on
143
- * subsequent calls.
144
- *
145
- * @return A reference to the structure containing the device information.
146
- */
147
- const ggml_cann_device_info& ggml_cann_info() {
148
- static ggml_cann_device_info info = ggml_cann_init();
149
- return info;
150
- }
151
-
152
- //#define DEBUG_CANN_MALLOC
153
- /**
154
- * @brief A pool of CANN buffers(priority segment buffer).
155
- *
156
- * This class manages a pool of CANN buffers for a specific device.
157
- */
158
- struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
159
- /**
160
- * @brief The maximum reuse margin for a buffer.
161
- */
162
- static const size_t max_reuse_margin = 1ull << 22; // 4MB
163
-
164
- /**
165
- * @brief The minimum free margin for a buffer.
166
- */
167
- static const size_t min_free_margin = 1ull << 20; // 1MB
168
-
169
- /**
170
- * @brief The alignment for buffer allocation.
171
- */
172
- static const size_t alignment = 128;
173
-
174
- /**
175
- * @brief The device ID associated with this buffer pool.
176
- */
177
- int device;
178
-
179
- /**
180
- * @brief Whether to disable clean during buffer allocation.
181
- */
182
- bool disable_clean = false;
183
-
184
- /**
185
- * @brief Structure representing a CANN buffer.
186
- */
187
- struct ggml_cann_buffer {
188
- void* ptr = nullptr; ///< Pointer to the buffer.
189
- size_t size = 0; ///< Size of the buffer.
190
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
191
-
192
- bool operator>(const ggml_cann_buffer& other) const {
193
- return size > other.size;
194
- }
195
- };
196
-
197
- /**
198
- * @brief Array of CANN buffers in the pool.
199
- */
200
- std::unordered_map<void*, size_t> buffer_pool;
201
- std::priority_queue<ggml_cann_buffer,
202
- std::vector<ggml_cann_buffer>,
203
- std::greater<>> free_buffers ;
204
-
205
- /**
206
- * @brief Total size of all buffers in the pool.
207
- */
208
- size_t pool_size = 0;
209
-
210
- /**
211
- * @brief Constructor to initialize the buffer pool for a specific device.
212
- *
213
- * @param device The device ID to associate with this buffer pool.
214
- */
215
- explicit ggml_cann_pool_buf_prio(int device) : device(device) {
216
- disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
217
- }
218
-
219
- /**
220
- * @brief Destructor to free all buffers in the pool.
221
- */
222
- ~ggml_cann_pool_buf_prio() {
223
- ggml_cann_set_device(device);
224
- for (auto& [b_ptr, b_size] : buffer_pool) {
225
- aclrtFree(b_ptr);
226
- pool_size -= b_size;
227
- }
228
- buffer_pool.clear();
229
- GGML_ASSERT(pool_size == 0);
230
- }
231
-
232
- /**
233
- * @brief Allocate a buffer of the given size.
234
- *
235
- * @param size The size of the buffer to allocate.
236
- * @param actual_size A pointer to a variable to receive the actual size of
237
- * the allocated buffer.
238
- * @return A pointer to the allocated buffer.
239
- */
240
- void* alloc(size_t size, size_t* actual_size) override {
241
- size = GGML_PAD(size, alignment);
242
- if (size == 0) {
243
- size = alignment;
244
- }
245
-
246
- void* ptr = nullptr;
247
- auto now = std::chrono::steady_clock::now();
248
-
249
- std::vector<ggml_cann_buffer> free_buffers_rest;
250
- free_buffers_rest.reserve(free_buffers.size());
251
- while (!free_buffers.empty()) {
252
- auto b = free_buffers.top();
253
- free_buffers.pop();
254
-
255
- if (b.size >= size) {
256
- // reuse the buffer if the size is enough
257
- const size_t margin = b.size - size;
258
- if (margin <= max_reuse_margin) {
259
- *actual_size = b.size;
260
- ptr = b.ptr;
261
- #ifdef DEBUG_CANN_MALLOC
262
- GGML_LOG_INFO(
263
- "cann pool[%d]: reused %p, "
264
- "pool_size = %5u MB, "
265
- "size = %5u MB, "
266
- "margin = %5u MB\n",
267
- device, b.ptr,
268
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
269
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
270
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
271
- #endif
272
- break;
273
- }
274
- }
275
-
276
- bool should_clean = !disable_clean &&
277
- b.size > min_free_margin &&
278
- std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
279
- if (should_clean) {
280
- // free the buffer if the size is needed to be freed
281
- ACL_CHECK(aclrtFree(b.ptr));
282
- pool_size -= b.size;
283
- buffer_pool.erase(b.ptr);
284
- #ifdef DEBUG_CANN_MALLOC
285
- GGML_LOG_INFO(
286
- "cann pool[%d]: clean %p, "
287
- "pool_size = %5u MB, "
288
- "size = %5u MB\n",
289
- device, b.ptr,
290
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
291
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
292
- #endif
293
- continue;
294
- }
295
- free_buffers_rest.push_back(b);
296
- }
297
- for (ggml_cann_buffer &b : free_buffers_rest) {
298
- free_buffers.push(std::move(b));
299
- }
300
-
301
- #ifdef DEBUG_CANN_MALLOC
302
- GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
303
- #endif
304
- if (ptr != nullptr) {
305
- return ptr;
306
- }
307
-
308
- // allocate a new buffer if no buffer can be reused
309
- ggml_cann_set_device(device);
310
- ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
311
- *actual_size = size;
312
- pool_size += size;
313
- #ifdef DEBUG_CANN_MALLOC
314
- GGML_LOG_INFO(
315
- "cann pool[%d]: allocate %p, "
316
- "pool_size = %5u MB, "
317
- "size = %5u MB\n",
318
- device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
319
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
320
- #endif
321
- buffer_pool.emplace(ptr, size);
322
- return ptr;
323
- }
324
-
325
- /**
326
- * @brief Free a buffer and return it to the pool.
327
- *
328
- * @param ptr Pointer to the buffer to free.
329
- * @param size Size of the buffer to free.
330
- */
331
- void free(void* ptr, size_t size) override {
332
- GGML_UNUSED(size);
333
- auto it = buffer_pool.find(ptr);
334
- if (it == buffer_pool.end()) {
335
- GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr);
336
- }
337
-
338
- auto now = std::chrono::steady_clock::now();
339
- free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
340
- #ifdef DEBUG_CANN_MALLOC
341
- GGML_LOG_INFO(
342
- "cann pool[%d]: return %p, "
343
- "pool_size = %5u MB\n",
344
- device, ptr,
345
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
346
- #endif
347
- }
348
- };
349
-
350
- /**
351
- * @brief A pool of CANN buffers(segment buffer).
352
- *
353
- * This class manages a pool of CANN buffers for a specific device.
354
- */
355
- struct ggml_cann_pool_buf : public ggml_cann_pool {
356
- /**
357
- * @brief The maximum reuse margin for a buffer.
358
- */
359
- static const size_t max_reuse_margin = 1ull << 22; // 4MB
360
-
361
- /**
362
- * @brief The minimum free margin for a buffer.
363
- */
364
- static const size_t min_free_margin = 1ull << 20; // 1MB
365
-
366
- /**
367
- * @brief The alignment for buffer allocation.
368
- */
369
- static const size_t alignment = 128;
370
-
371
- /**
372
- * @brief The maximum number of buffers in the pool.
373
- */
374
- static const int MAX_BUFFERS = 256;
375
-
376
- /**
377
- * @brief The device ID associated with this buffer pool.
378
- */
379
- int device;
380
-
381
- /**
382
- * @brief Whether to disable clean during buffer allocation.
383
- */
384
- bool disable_clean = false;
385
-
386
- /**
387
- * @brief Structure representing a CANN buffer.
388
- */
389
- struct ggml_cann_buffer {
390
- void* ptr = nullptr; ///< Pointer to the buffer memory.
391
- size_t size = 0; ///< Size of the buffer.
392
- bool used = false; ///< Whether the buffer is currently in use.
393
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
394
- };
395
-
396
- /**
397
- * @brief Array of CANN buffers in the pool.
398
- */
399
- ggml_cann_buffer buffer_pool[MAX_BUFFERS] = {};
400
-
401
- /**
402
- * @brief Total size of all buffers in the pool.
403
- */
404
- size_t pool_size = 0;
405
-
406
- /**
407
- * @brief Constructor to initialize the buffer pool for a specific device.
408
- *
409
- * @param device The device ID to associate with this buffer pool.
410
- */
411
- explicit ggml_cann_pool_buf(int device) : device(device) {
412
- disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
413
- }
414
-
415
- /**
416
- * @brief Destructor to free all buffers in the pool.
417
- */
418
- ~ggml_cann_pool_buf() {
419
- ggml_cann_set_device(device);
420
- for (int i = 0; i < MAX_BUFFERS; ++i) {
421
- ggml_cann_buffer& b = buffer_pool[i];
422
- if (b.ptr != nullptr) {
423
- aclrtFree(b.ptr);
424
- pool_size -= b.size;
425
- }
426
- }
427
- GGML_ASSERT(pool_size == 0);
428
- }
429
-
430
- /**
431
- * @brief Allocate a buffer of the given size.
432
- *
433
- * @param size The size of the buffer to allocate.
434
- * @param actual_size A pointer to a variable to receive the actual size of
435
- * the allocated buffer.
436
- * @return A pointer to the allocated buffer.
437
- */
438
- void* alloc(size_t size, size_t* actual_size) override {
439
- size = GGML_PAD(size, alignment);
440
- if (size == 0) {
441
- size = alignment;
442
- }
443
-
444
- void* ptr = nullptr;
445
- auto now = std::chrono::steady_clock::now();
446
-
447
- int i = 0;
448
- for (; i < MAX_BUFFERS; ++i) {
449
- ggml_cann_buffer& b = buffer_pool[i];
450
- if (b.ptr == nullptr) {
451
- break;
452
- }
453
- if (b.used) {
454
- continue;
455
- }
456
- if (b.size >= size) {
457
- // reuse the buffer if the size is enough
458
- const size_t margin = b.size - size;
459
- if (margin <= max_reuse_margin) {
460
- *actual_size = b.size;
461
- b.used = true;
462
- ptr = b.ptr;
463
- #ifdef DEBUG_CANN_MALLOC
464
- GGML_LOG_INFO(
465
- "cann pool[%d]: reused %p, "
466
- "pool_size = %5u MB, "
467
- "size = %5u MB, "
468
- "margin = %5u MB\n",
469
- device, b.ptr,
470
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
471
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
472
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
473
- #endif
474
- break;
475
- }
476
- }
477
-
478
- bool should_clean = !disable_clean &&
479
- b.size > min_free_margin &&
480
- std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
481
- if (should_clean) {
482
- // free the buffer if the size is needed to be freed
483
- ACL_CHECK(aclrtFree(b.ptr));
484
- pool_size -= b.size;
485
- #ifdef DEBUG_CANN_MALLOC
486
- GGML_LOG_INFO(
487
- "cann pool[%d]: clean %p, "
488
- "pool_size = %5u MB, "
489
- "size = %5u MB\n",
490
- device, b.ptr,
491
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
492
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
493
- #endif
494
- b.ptr = nullptr;
495
- }
496
- }
497
- if (ptr != nullptr) {
498
- return ptr;
499
- }
500
-
501
- if (i < MAX_BUFFERS) {
502
- // allocate a new buffer if no buffer can be reused
503
- ggml_cann_buffer& b = buffer_pool[i];
504
- ggml_cann_set_device(device);
505
- ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
506
- pool_size += size;
507
- *actual_size = size;
508
- b.size = size;
509
- b.used = true;
510
- if (i >= MAX_BUFFERS - 8) {
511
- GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
512
- }
513
- #ifdef DEBUG_CANN_MALLOC
514
- GGML_LOG_INFO(
515
- "cann pool[%d]: allocate %p, "
516
- "pool_size = %5u MB, "
517
- "size = %5u MB\n",
518
- device, b.ptr,
519
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
520
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
521
- #endif
522
- return b.ptr;
523
- }
524
-
525
- GGML_ABORT("cann pool[%d]: slots full\n", device);
526
- }
527
-
528
- /**
529
- * @brief Free a buffer and return it to the pool.
530
- *
531
- * @param ptr Pointer to the buffer to free.
532
- * @param size Size of the buffer to free.
533
- */
534
- void free(void* ptr, size_t size) override {
535
- GGML_UNUSED(size);
536
- for (int i = 0; i < MAX_BUFFERS; ++i) {
537
- ggml_cann_buffer& b = buffer_pool[i];
538
- if (b.ptr != ptr) {
539
- continue;
540
- }
541
- b.used = false;
542
- b.last_used = std::chrono::steady_clock::now();
543
- #ifdef DEBUG_CANN_MALLOC
544
- GGML_LOG_INFO(
545
- "cann pool[%d]: return %p, "
546
- "pool_size = %5u MB\n",
547
- device, b.ptr,
548
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
549
- #endif
550
- return;
551
- }
552
- GGML_ABORT("cann pool[%d]: slots full\n", device);
553
- }
554
- };
555
-
556
- /**
557
- * @brief A pool of CANN buffers with virtual memory.
558
- *
559
- * This class manages a pool of CANN buffers with virtual memory for a specific
560
- * device.
561
- */
562
- struct ggml_cann_pool_vmm : public ggml_cann_pool {
563
- /**
564
- * @brief The maximum size of the virtual memory pool (32 GB).
565
- */
566
- size_t max_size;
567
-
568
- /**
569
- * @brief The device ID associated with this buffer pool.
570
- */
571
- int device;
572
-
573
- /**
574
- * @brief Pointer to the start of the virtual memory pool.
575
- */
576
- void* pool_addr = 0;
577
-
578
- /**
579
- * @brief Amount of virtual memory used in the pool.
580
- */
581
- size_t pool_used = 0;
582
-
583
- /**
584
- * @brief Total size of the virtual memory pool.
585
- */
586
- size_t pool_size = 0;
587
-
588
- /**
589
- * @brief Allocation granularity for the virtual memory pool.
590
- */
591
- size_t granularity;
592
-
593
- /**
594
- * @brief Handles for the physical memory allocated.
595
- */
596
- std::vector<aclrtDrvMemHandle> handles;
597
-
598
- /**
599
- * @brief Offsets for the mapped memory regions.
600
- */
601
- std::vector<void*> map_offsets;
602
-
603
- /**
604
- * @brief Constructor to initialize the buffer pool with virtual memory for
605
- * a specific device.
606
- *
607
- * @param device The device ID to associate with this buffer pool.
608
- */
609
- explicit ggml_cann_pool_vmm(int device)
610
- : device(device) {
611
- auto dev = ggml_cann_info().devices[device];
612
- granularity = dev.vmm_granularity;
613
- max_size = dev.total_vram;
614
- }
615
-
616
- /**
617
- * @brief Destructor to free all buffers in the virtual memory pool.
618
- */
619
- ~ggml_cann_pool_vmm() {
620
- if (pool_addr != 0) {
621
- for (auto& offset : map_offsets) {
622
- ACL_CHECK(aclrtUnmapMem(offset));
623
- }
624
- for (auto& handle : handles) {
625
- ACL_CHECK(aclrtFreePhysical(handle));
626
- }
627
- ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
628
- }
629
- }
630
-
631
- /**
632
- * @brief Allocate a buffer of the given size in the virtual memory pool.
633
- *
634
- * @param size The size of the buffer to allocate.
635
- * @param actual_size A pointer to a variable to receive the actual size of
636
- * the allocated buffer.
637
- * @return A pointer to the allocated buffer.
638
- */
639
- void* alloc(size_t size, size_t* actual_size) override {
640
- // round up the allocation size to the alignment to ensure that all
641
- // allocations are aligned for all data types
642
- const size_t alignment = 128;
643
- size = GGML_PAD(size, alignment);
644
- if (size == 0) {
645
- size = alignment;
646
- }
647
-
648
- size_t avail = pool_size - pool_used;
649
-
650
- if (size > avail) {
651
- // round up to the next multiple of the granularity
652
- size_t reserve_size = size - avail;
653
- reserve_size = GGML_PAD(reserve_size, granularity);
654
-
655
- GGML_ASSERT(pool_size + reserve_size <= max_size);
656
-
657
- // allocate more physical memory
658
- aclrtPhysicalMemProp prop = {};
659
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
660
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
661
- prop.memAttr = ACL_HBM_MEM_HUGE;
662
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
663
- prop.location.id = device;
664
- prop.reserve = 0;
665
- aclrtDrvMemHandle handle;
666
- ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
667
-
668
- // reserve virtual address space (if not already reserved)
669
- if (pool_addr == 0) {
670
- ACL_CHECK(aclrtReserveMemAddress(
671
- &pool_addr, max_size, 0, NULL, 1));
672
- }
673
-
674
- // map at the end of the pool
675
- ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0,
676
- handle, 0));
677
-
678
- handles.push_back(handle);
679
- map_offsets.push_back((char*)pool_addr + pool_size);
680
-
681
- // add to the pool
682
- pool_size += reserve_size;
683
-
684
- #ifdef DEBUG_CANN_MALLOC
685
- GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
686
- device, (unsigned long long) (pool_size/1024/1024),
687
- (unsigned long long) (reserve_size/1024/1024));
688
- #endif
689
- }
690
-
691
- GGML_ASSERT(pool_addr != 0);
692
-
693
- void* ptr = (void*)((char*)pool_addr + pool_used);
694
- *actual_size = size;
695
- pool_used += size;
696
-
697
- #ifdef DEBUG_CANN_MALLOC
698
- GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
699
- (unsigned long long)size, (unsigned long long)ptr);
700
- #endif
701
- return ptr;
702
- }
703
-
704
- /**
705
- * @brief Free a buffer and return it to the virtual memory pool.
706
- *
707
- * @param ptr Pointer to the buffer to free.
708
- * @param size Size of the buffer to free.
709
- */
710
- void free(void* ptr, size_t size) override {
711
- #ifdef DEBUG_CANN_MALLOC
712
- GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
713
- (unsigned long long)size, (unsigned long long)ptr);
714
- #endif
715
-
716
- pool_used -= size;
717
-
718
- // all deallocations must be in reverse order of the allocations
719
- GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used));
720
- }
721
- };
722
-
723
- /**
724
- * @brief Create a new CANN pool for a specific device.
725
- *
726
- * Factory method to create a new CANN pool object based on the device type.
727
- *
728
- * @param device The device ID for which to create the pool.
729
- * @return A unique pointer to the created CANN pool.
730
- */
731
- std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
732
- int device) {
733
- bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
734
- if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
735
- GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
736
- return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
737
- }
738
- bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
739
- if (enable_buf_prio) {
740
- GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
741
- return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
742
- }
743
- GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
744
- return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
745
- }
746
-
747
- // cann buffer
748
- /**
749
- * @brief Context for managing a CANN buffer associated with a specific device.
750
- *
751
- * This structure holds information about a CANN buffer, including the device
752
- * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
753
- */
754
- struct ggml_backend_cann_buffer_context {
755
- int32_t device; ///< The device ID associated with this buffer context.
756
- void* dev_ptr =
757
- nullptr; ///< Pointer to the device memory allocated for the buffer.
758
-
759
- /**
760
- * @brief Constructor to initialize the CANN buffer context.
761
- *
762
- * @param device The device ID associated with this buffer context.
763
- * @param dev_ptr Pointer to the device memory allocated for the buffer.
764
- */
765
- ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
766
- : device(device),
767
- dev_ptr(dev_ptr) {}
768
-
769
- /**
770
- * @brief Destructor to free the device memory allocated for the buffer.
771
- */
772
- ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
773
- };
774
-
775
- /**
776
- * @brief Check if a buffer is a CANN buffer.
777
- *
778
- * This function checks if a given buffer is a CANN buffer by comparing its
779
- * `get_name` function pointer to `ggml_backend_cann_buffer_get_name`.
780
- *
781
- * @param buffer The buffer to check.
782
- * @return true if the buffer is a CANN buffer, false otherwise.
783
- */
784
- static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
785
- static bool ggml_backend_buffer_is_cann(
786
- ggml_backend_buffer_t buffer) {
787
- return ggml_backend_buft_is_cann(buffer->buft);
788
- }
789
-
790
- /**
791
- * @brief Free resources associated with a CANN buffer.
792
- *
793
- * This function frees the resources associated with a CANN buffer, including
794
- * its context.
795
- *
796
- * @param buffer The CANN buffer to free.
797
- */
798
- static void ggml_backend_cann_buffer_free_buffer(
799
- ggml_backend_buffer_t buffer) {
800
- ggml_backend_cann_buffer_context* ctx =
801
- (ggml_backend_cann_buffer_context*)buffer->context;
802
- delete ctx;
803
- }
804
-
805
- /**
806
- * @brief Retrieve the base pointer of a CANN buffer.
807
- *
808
- * This function returns the base pointer of a CANN buffer, which points to the
809
- * device memory allocated for the buffer.
810
- *
811
- * @param buffer The CANN buffer whose base pointer is to be retrieved.
812
- * @return A pointer to the base of the device memory allocated for the buffer.
813
- */
814
- static void* ggml_backend_cann_buffer_get_base(
815
- ggml_backend_buffer_t buffer) {
816
- ggml_backend_cann_buffer_context* ctx =
817
- (ggml_backend_cann_buffer_context*)buffer->context;
818
- return ctx->dev_ptr;
819
- }
820
-
821
- /**
822
- * @brief Transform quantized Q4.0 tensor data into a format suitable for CANN
823
- * processing.
824
- *
825
- * This function transforms quantized Q4.0 tensor data into a format suitable
826
- * for CANN processing. It extracts quantization values and scales from the
827
- * source data and prepares them in a format expected by CANN operations.
828
- *
829
- * @param tensor Pointer to the tensor information.
830
- * @param src Pointer to the source data in Q4.0 format.
831
- * @param dst Pointer to the destination buffer where transformed data will be
832
- * stored.
833
- */
834
- static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
835
- const void* src,
836
- void* dst) {
837
-
838
- int64_t n_elems = ggml_nelements(tensor);
839
- int64_t groups = n_elems / QK4_0;
840
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
841
-
842
- uint8_t* quant_offset = (uint8_t*)dst;
843
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
844
-
845
- for (int i = 0; i < groups; i++) {
846
- const block_q4_0* group =
847
- (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0));
848
- *scale_offset = group->d;
849
- scale_offset++;
850
-
851
- // 0-15
852
- for (int j = 0; j < QK4_0 / 2; j += 2) {
853
- (*quant_offset) = (group->qs[j] & 0x0F);
854
- (*quant_offset) |= ((group->qs[j + 1] << 4));
855
- quant_offset++;
856
- }
857
-
858
- // 16-31
859
- for (int j = 0; j < QK4_0 / 2; j += 2) {
860
- (*quant_offset) = (group->qs[j] >> 4);
861
- (*quant_offset) |= (group->qs[j + 1] & 0xF0);
862
- quant_offset++;
863
- }
864
- }
865
-
866
- // put (uint4b_t -8) into int4b_t
867
- for (quant_offset = (uint8_t*)dst;
868
- quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
869
- (*quant_offset) ^= 0x88;
870
- }
871
- }
872
-
873
- /**
874
- * @brief Transform CANN processed data back into quantized Q4.0 format.
875
- *
876
- * This function transforms CANN processed data back into quantized Q4.0 format.
877
- * It reverses the transformation performed by
878
- * ggml_backend_cann_transform_q4_0(), converting the data back into its
879
- * original quantized form.
880
- *
881
- * @param tensor Pointer to the tensor information.
882
- * @param src Pointer to the source buffer containing transformed data.
883
- * @param dst Pointer to the destination buffer where the Q4.0 formatted data
884
- * will be stored.
885
- */
886
- static void ggml_backend_cann_transform_back_q4_0(
887
- const ggml_tensor* tensor, void* src, void* dst) {
888
-
889
- int64_t n_elems = ggml_nelements(tensor);
890
- int64_t groups = n_elems / QK4_0;
891
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
892
-
893
- uint8_t* quant_offset = (uint8_t*)src;
894
- uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
895
-
896
- for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) {
897
- (*quant_offset) ^= 0x88;
898
- }
899
- quant_offset = (uint8_t*)src;
900
-
901
- for (int i = 0; i < groups; i++) {
902
- block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0));
903
- group->d = *scale_offset;
904
- scale_offset++;
905
-
906
- // 0-15
907
- for (int j = 0; j < QK4_0 / 2; j += 2) {
908
- group->qs[j] = ((*quant_offset) & 0x0F);
909
- group->qs[j + 1] = ((*quant_offset) >> 4);
910
- quant_offset++;
911
- }
912
-
913
- // 16-31
914
- for (int j = 0; j < QK4_0 / 2; j += 2) {
915
- group->qs[j] |= ((*quant_offset) << 4);
916
- group->qs[j + 1] |= ((*quant_offset) & 0xF0);
917
- quant_offset++;
918
- }
919
- }
920
- }
921
-
922
- /**
923
- * @brief Transform quantized Q8.0 tensor data into a format suitable for CANN
924
- * processing.
925
- *
926
- * This function transforms quantized Q8.0 tensor data into a format suitable
927
- * for CANN processing. It extracts quantization values and scales from the
928
- * source data and prepares them in a format expected by CANN operations.
929
- *
930
- * @param tensor Pointer to the tensor information.
931
- * @param src Pointer to the source data in Q8.0 format.
932
- * @param dst Pointer to the destination buffer where transformed data will be
933
- * stored.
934
- */
935
- static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
936
- const void* src,
937
- void* dst) {
938
- int64_t n_elems = ggml_nelements(tensor);
939
- int64_t groups = n_elems / QK8_0;
940
- size_t quant_bytes = n_elems * sizeof(uint8_t);
941
-
942
- uint8_t* quant_offset = (uint8_t*)dst;
943
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
944
-
945
- for (int i = 0; i < groups; i++) {
946
- const block_q8_0* group =
947
- (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0));
948
- *scale_offset = group->d;
949
- scale_offset++;
950
- size_t group_quant_size = QK8_0 * sizeof(uint8_t);
951
- memcpy(quant_offset, group->qs, group_quant_size);
952
- quant_offset += group_quant_size;
953
- }
954
- }
955
-
956
- /**
957
- * @brief Transform CANN processed data back into quantized Q8.0 format.
958
- *
959
- * This function transforms CANN processed data back into quantized Q8.0 format.
960
- * It reverses the transformation performed by
961
- * ggml_backend_cann_transform_q8_0(), converting the data back into its
962
- * original quantized form.
963
- *
964
- * @param tensor Pointer to the tensor information.
965
- * @param src Pointer to the source buffer containing transformed data.
966
- * @param dst Pointer to the destination buffer where the Q8.0 formatted data
967
- * will be stored.
968
- */
969
- static void ggml_backend_cann_transform_back_q8_0(
970
- const ggml_tensor* tensor, const void* src, void* dst) {
971
- int64_t n_elems = ggml_nelements(tensor);
972
- int64_t groups = n_elems / QK8_0;
973
- size_t quant_bytes = n_elems * sizeof(uint8_t);
974
-
975
- const uint8_t* quant_offset = (const uint8_t*)src;
976
- const uint16_t* scale_offset =
977
- (const uint16_t*)((const char*)src + quant_bytes);
978
-
979
- for (int i = 0; i < groups; i++) {
980
- block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
981
- group->d = *scale_offset;
982
- scale_offset++;
983
- size_t group_quant_size = QK8_0 * sizeof(uint8_t);
984
- memcpy(group->qs, quant_offset, group_quant_size);
985
- quant_offset += group_quant_size;
986
- }
987
- }
988
-
989
- /**
990
- * @brief Transform tensor data based on its type for CANN processing.
991
- *
992
- * This function transforms tensor data based on its quantization type for CANN
993
- * processing. It dispatches the transformation based on the tensor's type to
994
- * specialized functions handling Q4.0 and Q8.0 formats.
995
- *
996
- * @param tensor Pointer to the tensor information.
997
- * @param src Pointer to the source data to be transformed.
998
- * @param dst Pointer to the destination buffer where transformed data will be
999
- * stored.
1000
- */
1001
- static void ggml_backend_cann_transform(ggml_tensor* tensor,
1002
- const void* src, void* dst) {
1003
- switch (tensor->type) {
1004
- case GGML_TYPE_Q4_0:
1005
- ggml_backend_cann_transform_q4_0(tensor, src, dst);
1006
- break;
1007
- case GGML_TYPE_Q8_0:
1008
- ggml_backend_cann_transform_q8_0(tensor, src, dst);
1009
- break;
1010
- default:
1011
- break;
1012
- }
1013
- }
1014
-
1015
- /**
1016
- * @brief Transform CANN processed data back into tensor data based on its type.
1017
- *
1018
- * This function transforms CANN processed data back into tensor data based on
1019
- * its quantization type for Q4.0 and Q8.0 formats. It dispatches the
1020
- * transformation based on the tensor's type to specialized functions.
1021
- *
1022
- * @param tensor Pointer to the tensor information.
1023
- * @param src Pointer to the source data containing CANN processed data.
1024
- * @param dst Pointer to the destination buffer where transformed tensor data
1025
- * will be stored.
1026
- */
1027
- static void ggml_backend_cann_transform_back(
1028
- const ggml_tensor* tensor, void* src, void* dst) {
1029
- switch (tensor->type) {
1030
- case GGML_TYPE_Q4_0:
1031
- ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
1032
- break;
1033
- case GGML_TYPE_Q8_0:
1034
- ggml_backend_cann_transform_back_q8_0(tensor, src, dst);
1035
- break;
1036
- default:
1037
- break;
1038
- }
1039
- }
1040
-
1041
- /**
1042
- * @brief Check if transformation is needed for a given tensor type.
1043
- *
1044
- * This function checks if transformation is needed for a given tensor type
1045
- * to prepare data for CANN processing.
1046
- *
1047
- * @param type The tensor type to check.
1048
- * @return true if transformation is needed, false otherwise.
1049
- */
1050
- static bool need_transform(ggml_type type) {
1051
- switch (type) {
1052
- case GGML_TYPE_Q4_0:
1053
- case GGML_TYPE_Q8_0:
1054
- return true;
1055
- default:
1056
- return false;
1057
- }
1058
- }
1059
-
1060
- /**
1061
- * @brief Initialize a tensor using data from a CANN buffer.
1062
- *
1063
- * This function initializes a tensor using data from a CANN buffer.
1064
- * It handles special cases such as views and quantization.
1065
- *
1066
- * @param buffer The CANN buffer from which to initialize the tensor.
1067
- * @param tensor Pointer to the tensor to be initialized.
1068
- */
1069
- static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1070
- ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
1071
- if (tensor->view_src != NULL && tensor->view_offs == 0) {
1072
- GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
1073
- return GGML_STATUS_SUCCESS;
1074
- }
1075
-
1076
- // TODO: cann backend doesn't support quantized yet. Just leave the code
1077
- // here.
1078
- if (ggml_is_quantized(tensor->type)) {
1079
- // Initialize padding to 0 to avoid possible NaN values
1080
- size_t original_size = ggml_nbytes(tensor);
1081
- size_t padded_size =
1082
- ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
1083
-
1084
- if (padded_size > original_size && tensor->view_src == nullptr) {
1085
- size_t memset_size = padded_size - original_size;
1086
- ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
1087
- memset_size, 0, memset_size));
1088
- }
1089
- }
1090
- return GGML_STATUS_SUCCESS;
1091
- }
1092
-
1093
- // TODO: need handle tensor which has paddings.
1094
- /**
1095
- * @brief Set tensor data in a CANN buffer.
1096
- *
1097
- * This function sets tensor data in a CANN buffer, handling transformations
1098
- * if needed based on the tensor's type.
1099
- *
1100
- * @param buffer The CANN buffer where the tensor data will be set.
1101
- * @param tensor Pointer to the tensor whose data will be set.
1102
- * @param data Pointer to the source data to be copied into the tensor.
1103
- * @param offset Offset in the source data from where to start copying.
1104
- * @param size Size of the data to be copied, in bytes.
1105
- */
1106
- static void ggml_backend_cann_buffer_set_tensor(
1107
- ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
1108
- size_t offset, size_t size) {
1109
- ggml_backend_cann_buffer_context *ctx =
1110
- (ggml_backend_cann_buffer_context *)buffer->context;
1111
-
1112
- ggml_cann_set_device(ctx->device);
1113
- // TODO: refer to cann(#6017), it use thread's default stream.
1114
- // For acl, synchronous functions use this default stream.
1115
- // Why aclrtSynchronizeDevice?
1116
-
1117
- if (!need_transform(tensor->type)) {
1118
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
1119
- ACL_MEMCPY_HOST_TO_DEVICE));
1120
- } else {
1121
- void *transform_buffer = malloc(size);
1122
- ggml_backend_cann_transform(tensor, data, transform_buffer);
1123
-
1124
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
1125
- transform_buffer, size,
1126
- ACL_MEMCPY_HOST_TO_DEVICE));
1127
- free(transform_buffer);
1128
- }
1129
- }
1130
-
1131
- /**
1132
- * @brief Get tensor data from a CANN buffer.
1133
- *
1134
- * This function retrieves tensor data from a CANN buffer, handling
1135
- * transformations if needed based on the tensor's type.
1136
- *
1137
- * @param buffer The CANN buffer from which to retrieve tensor data.
1138
- * @param tensor Pointer to the tensor whose data will be retrieved.
1139
- * @param data Pointer to the destination buffer where the tensor data will be
1140
- * copied.
1141
- * @param offset Offset in the destination buffer where to start copying.
1142
- * @param size Size of the data to be copied, in bytes.
1143
- */
1144
- static void ggml_backend_cann_buffer_get_tensor(
1145
- ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
1146
- size_t offset, size_t size) {
1147
- ggml_backend_cann_buffer_context* ctx =
1148
- (ggml_backend_cann_buffer_context*)buffer->context;
1149
-
1150
- ggml_cann_set_device(ctx->device);
1151
-
1152
- if (!need_transform(tensor->type)) {
1153
- ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
1154
- ACL_MEMCPY_DEVICE_TO_HOST));
1155
- } else {
1156
- void* transform_buffer = malloc(size);
1157
- ACL_CHECK(aclrtMemcpy(transform_buffer, size,
1158
- (char*)tensor->data + offset, size,
1159
- ACL_MEMCPY_DEVICE_TO_HOST));
1160
- ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1161
- free(transform_buffer);
1162
- }
1163
- }
1164
-
1165
- /**
1166
- * @brief Copy tensor data between CANN buffers if possible.
1167
- *
1168
- * This function copies tensor data between CANN buffers if the source and
1169
- * destination buffers are CANN buffers and they meet the necessary conditions
1170
- * (same device or devices can access each other).
1171
- *
1172
- * @param buffer The destination CANN buffer where the tensor data will be
1173
- * copied.
1174
- * @param src Pointer to the source tensor whose data will be copied.
1175
- * @param dst Pointer to the destination tensor where the data will be copied.
1176
- * @return true if the copy operation succeeded, false otherwise.
1177
- */
1178
- static bool ggml_backend_cann_buffer_cpy_tensor(
1179
- ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
1180
- if (ggml_backend_buffer_is_cann(src->buffer)) {
1181
- ggml_backend_cann_buffer_context* src_ctx =
1182
- (ggml_backend_cann_buffer_context*)src->buffer->context;
1183
- ggml_backend_cann_buffer_context* dst_ctx =
1184
- (ggml_backend_cann_buffer_context*)buffer->context;
1185
-
1186
- size_t memcpy_size = ggml_nbytes(src);
1187
- // Same device.
1188
- if (src_ctx->device == dst_ctx->device) {
1189
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1190
- (const char*)src->data, memcpy_size,
1191
- ACL_MEMCPY_DEVICE_TO_DEVICE));
1192
- return true;
1193
- } else {
1194
- // Different device but can access by peer.
1195
- int32_t canAccessPeer = 0;
1196
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
1197
- dst_ctx->device));
1198
- if (canAccessPeer) {
1199
- ggml_cann_set_device(src_ctx->device);
1200
- ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
1201
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1202
- (const char*)src->data, memcpy_size,
1203
- ACL_MEMCPY_DEVICE_TO_DEVICE));
1204
- return true;
1205
- }
1206
- }
1207
- }
1208
- return false;
1209
- }
1210
-
1211
- /**
1212
- * @brief Clear a CANN buffer by setting all its memory to a specified value.
1213
- *
1214
- * This function clears a CANN buffer by setting all its memory to a specified
1215
- * value.
1216
- *
1217
- * @param buffer The CANN buffer to be cleared.
1218
- * @param value The value to which each byte in the buffer will be set.
1219
- */
1220
- static void ggml_backend_cann_buffer_clear(
1221
- ggml_backend_buffer_t buffer, uint8_t value) {
1222
- ggml_backend_cann_buffer_context* ctx =
1223
- (ggml_backend_cann_buffer_context*)buffer->context;
1224
-
1225
- ggml_cann_set_device(ctx->device);
1226
- ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
1227
- }
1228
-
1229
/**
 * @brief Interface for a CANN buffer in the backend.
 *
 * This structure defines function pointers to operations that can be performed
 * on a CANN buffer within the backend. The initializer is positional; the
 * comment in front of each entry names the field it is meant to fill.
 * `memset_tensor` and `reset` are left as NULL.
 */
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
    /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
    /* .get_base = */ ggml_backend_cann_buffer_get_base,
    /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
    /* .memset_tensor = */ NULL,
    /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
    /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
    /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor,
    /* .clear = */ ggml_backend_cann_buffer_clear,
    /* .reset = */ NULL,
};
1246
-
1247
// cann buffer type
/**
 * @brief Structure representing context information for a specific backend
 * buffer type.
 *
 * One instance is stored in the `context` field of each CANN buffer type and
 * is read back by the buffer-type callbacks (name lookup, buffer allocation).
 */
struct ggml_backend_cann_buffer_type_context {
    int32_t
        device; /**< Device identifier associated with the buffer context. */
    std::string name; /**< Name associated with the buffer context. */
};
1257
-
1258
- /**
1259
- * @brief Retrieves the name associated with a CANN buffer type.
1260
- *
1261
- * This function returns the descriptive name associated with the specified
1262
- * CANN buffer type context.
1263
- *
1264
- * @param buft Pointer to the buffer type context.
1265
- * @return Const pointer to the C-style string containing the name.
1266
- */
1267
- static const char* ggml_backend_cann_buffer_type_name(
1268
- ggml_backend_buffer_type_t buft) {
1269
- ggml_backend_cann_buffer_type_context* buft_ctx =
1270
- (ggml_backend_cann_buffer_type_context*)buft->context;
1271
-
1272
- return buft_ctx->name.c_str();
1273
- }
1274
-
1275
- /**
1276
- * @brief Allocates a new CANN buffer of the specified type and size.
1277
- *
1278
- * This function allocates a new CANN buffer on the specified device with the
1279
- * given size.
1280
- *
1281
- * @param buft Pointer to the buffer type context.
1282
- * @param size Size in bytes of the buffer to allocate.
1283
- * @return Pointer to the allocated buffer, or nullptr if allocation fails.
1284
- */
1285
- static ggml_backend_buffer_t
1286
- ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1287
- size_t size) {
1288
- ggml_backend_cann_buffer_type_context* buft_ctx =
1289
- (ggml_backend_cann_buffer_type_context*)buft->context;
1290
-
1291
- ggml_cann_set_device(buft_ctx->device);
1292
-
1293
- const size_t alignment = 128;
1294
- size = GGML_PAD(size, alignment);
1295
- if (size == 0) {
1296
- size = alignment;
1297
- }
1298
- void* dev_ptr;
1299
- aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1300
- if (err != ACL_SUCCESS) {
1301
- GGML_LOG_ERROR(
1302
- "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
1303
- __func__, size / 1024.0 / 1024.0, buft_ctx->device,
1304
- aclGetRecentErrMsg());
1305
- return nullptr;
1306
- }
1307
-
1308
- ggml_backend_cann_buffer_context* ctx =
1309
- new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
1310
-
1311
- return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
1312
- ctx, size);
1313
- }
1314
-
1315
- /**
1316
- * @brief Retrieves the memory alignment requirement for CANN buffers of this
1317
- * type.
1318
- *
1319
- * This function returns the alignment requirement in bytes for memory allocated
1320
- * by the CANN buffer type.
1321
- *
1322
- * @param buft Pointer to the buffer type context (unused in this
1323
- * implementation).
1324
- * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
1325
- * buffers).
1326
- */
1327
- static size_t ggml_backend_cann_buffer_type_get_alignment(
1328
- ggml_backend_buffer_type_t buft) {
1329
- return 128;
1330
-
1331
- GGML_UNUSED(buft);
1332
- }
1333
-
1334
- /**
1335
- * @brief Calculates the allocation size required for a tensor in a CANN buffer.
1336
- *
1337
- * Computes the total allocation size needed for storing the tensor's data in a
1338
- * CANN buffer, considering any necessary padding or adjustments for quantized
1339
- * types.
1340
- *
1341
- * @param buft Pointer to the buffer type context (unused in this
1342
- * implementation).
1343
- * @param tensor Pointer to the tensor for which the allocation size is
1344
- * calculated.
1345
- * @return The total allocation size in bytes required for the tensor in the
1346
- * CANN buffer.
1347
- */
1348
- static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1349
- ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
1350
- size_t size = ggml_nbytes(tensor);
1351
- int64_t ne0 = tensor->ne[0];
1352
-
1353
- // last line must bigger than 32, because every single op deal at
1354
- // least 32 bytes.
1355
- // TODO: quantized type?
1356
- // int64_t line_size = ne0 * ggml_element_size(tensor);
1357
- // int64_t line_size_align_32 = (line_size + 31) & ~31;
1358
- // size += (line_size_align_32 - line_size);
1359
-
1360
- // TODO: not support quantized yet.
1361
- // TODO: consider un-continue tensor.
1362
- if (ggml_is_quantized(tensor->type)) {
1363
- if (ne0 % MATRIX_ROW_PADDING != 0) {
1364
- size += ggml_row_size(
1365
- tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1366
- }
1367
- }
1368
-
1369
- return size;
1370
-
1371
- GGML_UNUSED(buft);
1372
- }
1373
-
1374
- static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1375
- return false;
1376
-
1377
- GGML_UNUSED(buft);
1378
- }
1379
-
1380
/**
 * @brief Interface for managing CANN buffer types in the GGML backend.
 *
 * Provides function pointers for allocating, querying properties, and managing
 * memory for CANN buffer types in the GGML backend. The initializer is
 * positional; the comment in front of each entry names the field it is meant
 * to fill.
 */
static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
    /* .get_name = */ ggml_backend_cann_buffer_type_name,
    /* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer,
    /* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment,
    /* .get_max_size = */ NULL, // defaults to SIZE_MAX
    /* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size,
    /* .is_host = */ ggml_backend_cann_buffer_type_is_host,
};
1394
-
1395
- /**
1396
- * @brief Retrieves the CANN buffer type for a specified device.
1397
- *
1398
- * This function initializes and returns the buffer type interface associated
1399
- * with the given device. It ensures thread-safe access using a mutex.
1400
- *
1401
- * @param device The device index for which to retrieve the buffer type.
1402
- * @return A pointer to the buffer type interface for the specified device, or
1403
- * nullptr if the device index is out of range.
1404
- */
1405
- ggml_backend_buffer_type_t
1406
- ggml_backend_cann_buffer_type(int32_t device) {
1407
- static std::mutex mutex;
1408
- std::lock_guard<std::mutex> lock(mutex);
1409
-
1410
- if (device >= ggml_backend_cann_get_device_count()) {
1411
- return nullptr;
1412
- }
1413
-
1414
- static ggml_backend_buffer_type
1415
- ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
1416
-
1417
- static bool ggml_backend_cann_buffer_type_initialized = false;
1418
-
1419
- if (!ggml_backend_cann_buffer_type_initialized) {
1420
- for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
1421
- ggml_backend_cann_buffer_types[i] = {
1422
- /* .iface = */ ggml_backend_cann_buffer_type_interface,
1423
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
1424
- /* .context = */
1425
- new ggml_backend_cann_buffer_type_context{
1426
- i, "CANN" + std::to_string(i)},
1427
- };
1428
- }
1429
- ggml_backend_cann_buffer_type_initialized = true;
1430
- }
1431
-
1432
- return &ggml_backend_cann_buffer_types[device];
1433
- }
1434
-
1435
- /**
1436
- * @brief Retrieves the name associated with a CANN host buffer type.
1437
- *
1438
- * This function returns the descriptive name associated with the specified
1439
- * CANN host buffer type context.
1440
- *
1441
- * @param buft Pointer to the host buffer type context.
1442
- * @return Const pointer to the C-style string containing the name.
1443
- */
1444
- static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
1445
- return "CANN_Host";
1446
-
1447
- GGML_UNUSED(buft);
1448
- }
1449
-
1450
- /**
1451
- * @brief Retrieves the name associated with a CANN host buffer.
1452
- *
1453
- * This function returns the descriptive name associated with the specified
1454
- * CANN host buffer context.
1455
- *
1456
- * @param buft Pointer to the host buffer context.
1457
- * @return Const pointer to the C-style string containing the name.
1458
- */
1459
- static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
1460
- return "CANN_Host";
1461
-
1462
- GGML_UNUSED(buffer);
1463
- }
1464
-
1465
- /**
1466
- * @brief Free resources associated with a CANN host buffer.
1467
- *
1468
- * This function frees the resources associated with a CANN host buffer, including
1469
- * its context.
1470
- *
1471
- * @param buffer The CANN host buffer to free.
1472
- */
1473
- static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
1474
- ACL_CHECK(aclrtFreeHost(buffer->context));
1475
- }
1476
-
1477
- /**
1478
- * @brief Allocates a new CANN host buffer of the specified size.
1479
- *
1480
- * This function allocates a new CANN host buffer with the given size.
1481
- * @param size Size in bytes of the host buffer to allocate.
1482
- * @return Pointer to the allocated host buffer, or nullptr if allocation fails.
1483
- */
1484
- static void * ggml_cann_host_malloc(size_t size) {
1485
- if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
1486
- return nullptr;
1487
- }
1488
-
1489
- const size_t alignment = 128;
1490
- size = GGML_PAD(size, alignment);
1491
- if (size == 0) {
1492
- size = alignment;
1493
- }
1494
-
1495
- void * hostPtr = nullptr;
1496
- aclError err = aclrtMallocHost((void **) &hostPtr, size);
1497
- if (err != ACL_SUCCESS) {
1498
- GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1499
- size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1500
- return nullptr;
1501
- }
1502
- return hostPtr;
1503
- }
1504
-
1505
- /**
1506
- * @brief Allocates a new CANN host buffer of the specified type and size.
1507
- *
1508
- * @param buft Pointer to the host buffer type context.
1509
- * @param size Size in bytes of the host buffer to allocate.
1510
- * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
1511
- */
1512
- static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1513
- void * hostPtr = ggml_cann_host_malloc(size);
1514
-
1515
- if (hostPtr == nullptr) {
1516
- // fallback to cpu buffer
1517
- return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1518
- }
1519
-
1520
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
1521
- buffer->buft = buft;
1522
- buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1523
-
1524
- return buffer;
1525
- }
1526
-
1527
/**
 * @brief Returns the buffer type used for CANN host buffers.
 *
 * The buffer type is a function-local static. Its name and allocation
 * callbacks are the CANN host versions, while alignment, allocation size and
 * the is_host query are taken from the CPU buffer type. It is associated with
 * device 0 of the CANN registry and carries no context.
 *
 * @return Pointer to the static CANN host buffer type.
 */
ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
        /* .iface = */ {
            /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
            /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
            /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
            /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
        },
        /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
        /* .context = */ nullptr,
    };

    return &ggml_backend_cann_buffer_type_host;
}
1549
-
1550
/**
 * @brief Computes the forward operation for a given tensor using CANN
 * operations.
 *
 * Dispatches on `dst->op` (and, for GGML_OP_UNARY, on the unary sub-op) to
 * the matching CANN implementation. Pure layout ops (NONE, RESHAPE, VIEW,
 * PERMUTE, TRANSPOSE) are accepted without doing any work.
 *
 * @param ctx The CANN context containing necessary resources and
 * configurations.
 * @param dst The destination tensor where the result of the computation will be
 * stored.
 * @return true if the op was dispatched; false if the op (or unary sub-op) has
 * no case in this switch.
 */
static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                                      struct ggml_tensor* dst) {
    switch (dst->op) {
        case GGML_OP_REPEAT:
            ggml_cann_repeat(ctx, dst);
            break;
        case GGML_OP_GET_ROWS:
            ggml_cann_get_rows(ctx, dst);
            break;
        case GGML_OP_DUP:
            ggml_cann_dup(ctx, dst);
            break;
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
            ggml_cann_binary_op<aclnn_add>(ctx, dst);
            break;
        case GGML_OP_SUB:
            ggml_cann_binary_op<aclnn_sub>(ctx, dst);
            break;
        case GGML_OP_ACC:
            ggml_cann_acc(ctx, dst);
            break;
        case GGML_OP_MUL:
            ggml_cann_binary_op<aclnn_mul>(ctx, dst);
            break;
        case GGML_OP_DIV:
            ggml_cann_binary_op<aclnn_div>(ctx, dst);
            break;
        case GGML_OP_UNARY:
            // Second-level dispatch on the unary sub-op.
            switch (ggml_get_unary_op(dst)) {
                case GGML_UNARY_OP_ABS:
                    GGML_CANN_CALL_UNARY_OP(Abs);
                    break;
                case GGML_UNARY_OP_NEG:
                    GGML_CANN_CALL_UNARY_OP(Neg);
                    break;
                case GGML_UNARY_OP_GELU:
                    GGML_CANN_CALL_UNARY_OP(Gelu);
                    break;
                case GGML_UNARY_OP_SILU:
                    GGML_CANN_CALL_UNARY_OP(Silu);
                    break;
                case GGML_UNARY_OP_GELU_QUICK: {
                    // GeluV2 takes an extra argument (0 here), so it cannot go
                    // through the plain unary-op macro used by the other cases.
                    auto lambda = [](ggml_backend_cann_context& ctx,
                                     aclTensor* acl_src,
                                     aclTensor* acl_dst) {
                        GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
                    };
                    ggml_cann_unary_op(lambda, ctx, dst);
                } break;
                case GGML_UNARY_OP_TANH:
                    GGML_CANN_CALL_UNARY_OP(Tanh);
                    break;
                case GGML_UNARY_OP_RELU:
                    GGML_CANN_CALL_UNARY_OP(Relu);
                    break;
                case GGML_UNARY_OP_SIGMOID:
                    GGML_CANN_CALL_UNARY_OP(Sigmoid);
                    break;
                case GGML_UNARY_OP_HARDSIGMOID:
                    GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
                    break;
                case GGML_UNARY_OP_HARDSWISH:
                    GGML_CANN_CALL_UNARY_OP(Hardswish);
                    break;
                case GGML_UNARY_OP_EXP:
                    GGML_CANN_CALL_UNARY_OP(Exp);
                    break;
                case GGML_UNARY_OP_ELU:
                    ggml_cann_elu(ctx, dst);
                    break;
                case GGML_UNARY_OP_SGN:
                    GGML_CANN_CALL_UNARY_OP(Sign);
                    break;
                case GGML_UNARY_OP_STEP:
                    ggml_cann_step(ctx, dst);
                    break;
                default:
                    return false;
            }
            break;
        case GGML_OP_NORM:
            ggml_cann_norm(ctx, dst);
            break;
        case GGML_OP_GROUP_NORM:
            ggml_cann_group_norm(ctx, dst);
            break;
        case GGML_OP_CONCAT:
            ggml_cann_concat(ctx, dst);
            break;
        case GGML_OP_UPSCALE:
            ggml_cann_upsample_nearest2d(ctx, dst);
            break;
        case GGML_OP_PAD:
            ggml_cann_pad(ctx, dst);
            break;
        case GGML_OP_ARANGE:
            ggml_cann_arange(ctx, dst);
            break;
        case GGML_OP_TIMESTEP_EMBEDDING:
            ggml_cann_timestep_embedding(ctx, dst);
            break;
        case GGML_OP_LEAKY_RELU:
            ggml_cann_leaky_relu(ctx, dst);
            break;
        case GGML_OP_RMS_NORM:
            ggml_cann_rms_norm(ctx, dst);
            break;
        case GGML_OP_MUL_MAT:
            ggml_cann_mul_mat(ctx, dst);
            break;
        case GGML_OP_MUL_MAT_ID:
            ggml_cann_mul_mat_id(ctx, dst);
            break;
        case GGML_OP_SCALE:
            ggml_cann_scale(ctx, dst);
            break;
        case GGML_OP_SQR:
            // Square is computed as src0 * src0 by aliasing src[1] to src[0].
            // NOTE(review): src[1] is written here and never restored, so a
            // second pass over the same node would presumably trip the assert
            // below - confirm whether graphs are ever re-evaluated.
            GGML_ASSERT(dst->src[1] == nullptr);
            dst->src[1] = dst->src[0];
            ggml_cann_binary_op<aclnn_mul>(ctx, dst);
            break;
        case GGML_OP_SQRT:
            GGML_CANN_CALL_UNARY_OP(Sqrt);
            break;
        case GGML_OP_CLAMP:
            ggml_cann_clamp(ctx, dst);
            break;
        case GGML_OP_CPY:
            ggml_cann_cpy(ctx, dst);
            break;
        case GGML_OP_CONT:
            ggml_cann_dup(ctx, dst);
            break;
        // These ops need no computation here.
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            break;
        case GGML_OP_DIAG_MASK_INF:
            ggml_cann_diag_mask(ctx, dst, -INFINITY);
            break;
        case GGML_OP_SOFT_MAX:
            ggml_cann_softmax(ctx, dst);
            break;
        case GGML_OP_ROPE:
            ggml_cann_rope(ctx, dst);
            break;
        case GGML_OP_IM2COL:
            ggml_cann_im2col(ctx, dst);
            break;
        case GGML_OP_POOL_2D:
            ggml_cann_pool2d(ctx, dst);
            break;
        case GGML_OP_SUM:
            ggml_cann_sum(ctx, dst);
            break;
        case GGML_OP_SUM_ROWS:
            ggml_cann_sum_rows(ctx, dst);
            break;
        case GGML_OP_ARGSORT:
            ggml_cann_argsort(ctx, dst);
            break;
        case GGML_OP_ARGMAX:
            ggml_cann_argmax(ctx, dst);
            break;
        case GGML_OP_COS:
            ggml_cann_unary_op<aclnn_cos>(ctx, dst);
            break;
        case GGML_OP_SIN:
            ggml_cann_unary_op<aclnn_sin>(ctx, dst);
            break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            ggml_cann_conv_transpose_1d(ctx, dst);
            break;
        case GGML_OP_LOG:
            GGML_CANN_CALL_UNARY_OP(Log);
            break;
        case GGML_OP_MEAN:
            ggml_cann_mean(ctx, dst);
            break;
        case GGML_OP_PAD_REFLECT_1D:
            ggml_cann_pad_reflect_1d(ctx, dst);
            break;
        case GGML_OP_COUNT_EQUAL:
            ggml_cann_count_equal(ctx, dst);
            break;
        default:
            return false;
    }

    return true;
}
1757
-
1758
- // backend
1759
- /**
1760
- * @brief Retrieves the name associated with the CANN backend.
1761
- *
1762
- * This function returns the name assigned to the CANN backend, which is stored
1763
- * in the context of the provided backend structure.
1764
- *
1765
- * @param backend Pointer to the CANN backend structure.
1766
- * @return A pointer to a constant string representing the backend name.
1767
- */
1768
- static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1769
- ggml_backend_cann_context* cann_ctx =
1770
- (ggml_backend_cann_context*)backend->context;
1771
-
1772
- return cann_ctx->name.c_str();
1773
- }
1774
-
1775
- /**
1776
- * @brief Frees resources associated with the CANN backend.
1777
- *
1778
- * This function releases resources associated with the CANN backend context
1779
- * and resets the device associated with the backend to its initial state.
1780
- *
1781
- * @param backend Pointer to the CANN backend structure to be freed.
1782
- */
1783
- static void ggml_backend_cann_free(ggml_backend_t backend) {
1784
- ggml_backend_cann_context* cann_ctx =
1785
- (ggml_backend_cann_context*)backend->context;
1786
- ACL_CHECK(aclrtSynchronizeDevice());
1787
- ACL_CHECK(aclrtResetDevice(cann_ctx->device));
1788
-
1789
- delete cann_ctx;
1790
- delete backend;
1791
- }
1792
-
1793
-
1794
- /**
1795
- * @brief Sets tensor data asynchronously in the CANN backend.
1796
- *
1797
- * This function asynchronously sets tensor data in the CANN backend.
1798
- *
1799
- * @param backend Pointer to the CANN backend structure.
1800
- * @param tensor Pointer to the tensor structure to set data for.
1801
- * @param data Pointer to the host data to copy to the tensor.
1802
- * @param offset Offset in bytes within the host data.
1803
- * @param size Size of the data to copy in bytes.
1804
- */
1805
- static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1806
- ggml_tensor *tensor,
1807
- const void *data,
1808
- size_t offset,
1809
- size_t size) {
1810
- ggml_backend_cann_context *cann_ctx =
1811
- (ggml_backend_cann_context *)backend->context;
1812
- ggml_backend_buffer_t buf =
1813
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1814
-
1815
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1816
- "unsupported buffer type");
1817
- GGML_ASSERT(!ggml_is_quantized(tensor->type));
1818
-
1819
- ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
1820
- ACL_MEMCPY_HOST_TO_DEVICE);
1821
- }
1822
-
1823
- /**
1824
- * @brief Gets tensor data asynchronously in the CANN backend.
1825
- *
1826
- * This function asynchronously gets tensor data in the CANN backend.
1827
- *
1828
- * @param backend Pointer to the CANN backend structure.
1829
- * @param tensor Pointer to the tensor structure to get data from.
1830
- * @param data Pointer to the host data to copy from the tensor.
1831
- * @param offset Offset in bytes within the host data.
1832
- * @param size Size of the data to copy in bytes.
1833
- */
1834
- static void ggml_backend_cann_get_tensor_async(
1835
- ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1836
- size_t offset, size_t size) {
1837
- ggml_backend_cann_context *cann_ctx =
1838
- (ggml_backend_cann_context *)backend->context;
1839
- ggml_backend_buffer_t buf =
1840
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1841
-
1842
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1843
- "unsupported buffer type");
1844
- GGML_ASSERT(!ggml_is_quantized(tensor->type));
1845
-
1846
- ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
1847
- ACL_MEMCPY_DEVICE_TO_HOST);
1848
-
1849
- }
1850
-
1851
- /**
1852
- * @brief Asynchronously copies tensor data between CANN backends.
1853
- *
1854
- * This function copies tensor data asynchronously between two CANN backends. It
1855
- * checks if both tensors reside in CANN buffers and whether the devices support
1856
- * peer-to-peer access for direct copying. If not, it returns false.
1857
- *
1858
- * @param backend_src Pointer to the source CANN backend structure.
1859
- * @param backend_dst Pointer to the destination CANN backend structure.
1860
- * @param src Pointer to the source tensor to copy data from.
1861
- * @param dst Pointer to the destination tensor to copy data to.
1862
- * @return true if the copy operation succeeds, false otherwise.
1863
- */
1864
- static bool ggml_backend_cann_cpy_tensor_async(
1865
- ggml_backend_t backend_src, ggml_backend_t backend_dst,
1866
- const ggml_tensor* src, ggml_tensor* dst) {
1867
- GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
1868
- ggml_backend_is_cann(backend_dst));
1869
-
1870
- if (!ggml_backend_buffer_is_cann(src->buffer) ||
1871
- !ggml_backend_buffer_is_cann(dst->buffer)) {
1872
- return false;
1873
- }
1874
-
1875
- ggml_backend_buffer_t buf_src =
1876
- src->view_src ? src->view_src->buffer : src->buffer;
1877
- ggml_backend_buffer_t buf_dst =
1878
- dst->view_src ? dst->view_src->buffer : dst->buffer;
1879
-
1880
- ggml_backend_cann_context* cann_ctx_src =
1881
- (ggml_backend_cann_context*)backend_src->context;
1882
- ggml_backend_cann_context* cann_ctx_dst =
1883
- (ggml_backend_cann_context*)backend_dst->context;
1884
-
1885
- size_t copy_size = ggml_nbytes(dst);
1886
- if (backend_src != backend_dst) {
1887
- ggml_backend_cann_buffer_context* buf_ctx_src =
1888
- (ggml_backend_cann_buffer_context*)buf_src->context;
1889
- ggml_backend_cann_buffer_context* buf_ctx_dst =
1890
- (ggml_backend_cann_buffer_context*)buf_dst->context;
1891
-
1892
- GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
1893
- GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
1894
-
1895
- int32_t canAccessPeer = 0;
1896
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device,
1897
- cann_ctx_dst->device));
1898
- if (!canAccessPeer) {
1899
- return false;
1900
- }
1901
-
1902
- // need open both directions for memcpyasync between devices.
1903
- ggml_cann_set_device(cann_ctx_dst->device);
1904
- ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
1905
- ggml_cann_set_device(cann_ctx_src->device);
1906
- ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
1907
-
1908
- // wait for task_queue empty to keep task order.
1909
- cann_ctx_src->task_queue.wait();
1910
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1911
- ACL_MEMCPY_DEVICE_TO_DEVICE,
1912
- cann_ctx_src->stream()));
1913
-
1914
- //TODO: workaround for Event didn`t work here.
1915
- aclrtSynchronizeStream(cann_ctx_src->stream());
1916
- } else {
1917
- // src and dst are on the same backend
1918
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1919
- ACL_MEMCPY_DEVICE_TO_DEVICE,
1920
- cann_ctx_dst->stream()));
1921
- }
1922
-
1923
- return true;
1924
- }
1925
-
1926
- /**
1927
- * @brief Synchronizes a CANN backend.
1928
- *
1929
- * This function synchronizes the specified CANN backend by waiting for all
1930
- * operations in its associated stream to complete.
1931
- *
1932
- * @param backend Pointer to the CANN backend structure to synchronize.
1933
- */
1934
- static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1935
- ggml_backend_cann_context* cann_ctx =
1936
- (ggml_backend_cann_context*)backend->context;
1937
- cann_ctx->task_queue.wait();
1938
- ggml_cann_set_device(cann_ctx->device);
1939
- ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1940
- }
1941
-
1942
- /**
1943
- * @brief Computes a computational graph using a CANN backend.
1944
- *
1945
- * This function computes the operations defined in the computational graph
1946
- * using the specified CANN backend.
1947
- *
1948
- * @param backend Pointer to the CANN backend structure to use for computation.
1949
- * @param cgraph Pointer to the computational graph structure containing nodes
1950
- * representing operations to be computed.
1951
- * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
1952
- * completes successfully, otherwise an appropriate error status.
1953
- */
1954
- static enum ggml_status ggml_backend_cann_graph_compute(
1955
- ggml_backend_t backend, ggml_cgraph* cgraph) {
1956
- ggml_backend_cann_context* cann_ctx =
1957
- (ggml_backend_cann_context*)backend->context;
1958
-
1959
- ggml_cann_set_device(cann_ctx->device);
1960
-
1961
- for (int i = 0; i < cgraph->n_nodes; i++) {
1962
- ggml_tensor* node = cgraph->nodes[i];
1963
-
1964
- if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
1965
- continue;
1966
- }
1967
-
1968
- bool ok = ggml_cann_compute_forward(*cann_ctx, node);
1969
-
1970
- if (!ok) {
1971
- GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
1972
- node->name, ggml_op_name(node->op));
1973
- }
1974
- GGML_ASSERT(ok);
1975
- }
1976
-
1977
- return GGML_STATUS_SUCCESS;
1978
- }
1979
-
1980
- /**
1981
- * @brief Checks if the CANN backend supports a specific operation.
1982
- *
1983
- * This function checks whether the specified operation is supported by the
1984
- * CANN backend.
1985
- *
1986
- * @param backend Pointer to the CANN backend structure to check support for
1987
- * the operation.
1988
- * @param op Pointer to the tensor representing the operation to check.
1989
- * @return bool Returns true if the operation is supported by the backend,
1990
- * otherwise false.
1991
- */
1992
- static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1993
- const ggml_tensor* op) {
1994
- switch (op->op) {
1995
- case GGML_OP_UNARY:
1996
- switch (ggml_get_unary_op(op)) {
1997
- case GGML_UNARY_OP_ABS:
1998
- case GGML_UNARY_OP_NEG:
1999
- case GGML_UNARY_OP_GELU:
2000
- case GGML_UNARY_OP_SILU:
2001
- case GGML_UNARY_OP_RELU:
2002
- case GGML_UNARY_OP_SIGMOID:
2003
- case GGML_UNARY_OP_HARDSIGMOID:
2004
- case GGML_UNARY_OP_HARDSWISH:
2005
- case GGML_UNARY_OP_GELU_QUICK:
2006
- case GGML_UNARY_OP_TANH:
2007
- case GGML_UNARY_OP_EXP:
2008
- case GGML_UNARY_OP_ELU:
2009
- case GGML_UNARY_OP_SGN:
2010
- case GGML_UNARY_OP_STEP:
2011
- return true;
2012
- default:
2013
- return false;
2014
- }
2015
- case GGML_OP_MUL_MAT: {
2016
- switch (op->src[0]->type) {
2017
- case GGML_TYPE_F16:
2018
- case GGML_TYPE_F32:
2019
- return true;
2020
- case GGML_TYPE_Q8_0:
2021
- case GGML_TYPE_Q4_0:
2022
- #ifdef ASCEND_310P
2023
- // Q4 && Q8 per group is not suppor on 310p device
2024
- return false;
2025
- #endif
2026
- // only support contiguous for quantized types.
2027
- return ggml_is_contiguous(op->src[0]) &&
2028
- ggml_is_contiguous(op->src[1]);
2029
- default:
2030
- return false;
2031
- }
2032
- }
2033
- case GGML_OP_MUL_MAT_ID:
2034
- switch (op->src[0]->type) {
2035
- case GGML_TYPE_F16:
2036
- case GGML_TYPE_F32:
2037
- return true;
2038
- case GGML_TYPE_Q8_0:
2039
- case GGML_TYPE_Q4_0:
2040
- #ifdef ASCEND_310P
2041
- // Q4 && Q8 per group is not suppor on 310p device
2042
- return false;
2043
- #endif
2044
- // only support contiguous for quantized types.
2045
- return ggml_is_contiguous(op->src[0]) &&
2046
- ggml_is_contiguous(op->src[1]);
2047
- default:
2048
- return false;
2049
- }
2050
- // embedding
2051
- case GGML_OP_GET_ROWS: {
2052
- switch (op->src[0]->type) {
2053
- case GGML_TYPE_F32:
2054
- case GGML_TYPE_F16:
2055
- case GGML_TYPE_Q8_0:
2056
- return true;
2057
- default:
2058
- return false;
2059
- }
2060
- } break;
2061
- case GGML_OP_CPY: {
2062
- ggml_tensor *src = op->src[0];
2063
- if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2064
- (src->type != GGML_TYPE_F32 &&
2065
- src->type != GGML_TYPE_F16)) {
2066
- // only support F32 and F16.
2067
- return false;
2068
- }
2069
-
2070
- if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
2071
- // unsupport dst is not contiguous.
2072
- return false;
2073
- }
2074
-
2075
- return true;
2076
- } break;
2077
- case GGML_OP_CONT: {
2078
- // TODO: support GGML_TYPE_BF16
2079
- switch (op->src[0]->type) {
2080
- case GGML_TYPE_F32:
2081
- case GGML_TYPE_F16:
2082
- return true;
2083
- default:
2084
- return false;
2085
- }
2086
- }
2087
- case GGML_OP_ROPE: {
2088
- // TODO: with ops-test v == 1
2089
- float ext_factor = 0.0f;
2090
- memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
2091
- // TODO: n_dims <= ne0
2092
- if (op->src[0]->ne[0] != op->op_params[1]) {
2093
- return false;
2094
- }
2095
- // TODO: ext_factor != 0
2096
- if (ext_factor != 0) {
2097
- return false;
2098
- }
2099
-
2100
- const int mode = ((const int32_t *) op->op_params)[2];
2101
- if (mode & GGML_ROPE_TYPE_MROPE) {
2102
- return false;
2103
- }
2104
- if (mode & GGML_ROPE_TYPE_VISION) {
2105
- return false;
2106
- }
2107
-
2108
- if(!ggml_is_contiguous(op->src[0])){
2109
- return false;
2110
- }
2111
- return true;
2112
- }
2113
- case GGML_OP_UPSCALE: {
2114
- // aclnnUpsampleNearest2dGetWorkspaceSize not support
2115
- // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
2116
- if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
2117
- return false;
2118
- }
2119
- if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2120
- return false;
2121
- }
2122
- return true;
2123
- }
2124
- case GGML_OP_POOL_2D: {
2125
- const int32_t * opts = (const int32_t *) op->op_params;
2126
- #ifdef ASCEND_310P
2127
- enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2128
- if(opt == GGML_OP_POOL_MAX){
2129
- return false;
2130
- }
2131
- #endif
2132
- const int k0 = opts[1];
2133
- const int k1 = opts[2];
2134
- const int p0 = opts[5];
2135
- const int p1 = opts[6];
2136
- // value of paddingH should be at most half of kernelH
2137
- // value of paddingW should be at most half of kernelW
2138
- return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2139
- }
2140
- case GGML_OP_SUM:
2141
- case GGML_OP_DUP:
2142
- case GGML_OP_IM2COL:
2143
- case GGML_OP_CONCAT:
2144
- case GGML_OP_REPEAT:
2145
- case GGML_OP_NONE:
2146
- case GGML_OP_RESHAPE:
2147
- case GGML_OP_VIEW:
2148
- case GGML_OP_PERMUTE:
2149
- case GGML_OP_TRANSPOSE:
2150
- case GGML_OP_NORM:
2151
- case GGML_OP_ADD:
2152
- case GGML_OP_ADD1:
2153
- case GGML_OP_SUB:
2154
- case GGML_OP_MUL:
2155
- case GGML_OP_DIV:
2156
- case GGML_OP_RMS_NORM:
2157
- case GGML_OP_SCALE:
2158
- case GGML_OP_SQR:
2159
- case GGML_OP_SQRT:
2160
- case GGML_OP_CLAMP:
2161
- case GGML_OP_DIAG_MASK_INF:
2162
- case GGML_OP_SOFT_MAX:
2163
- case GGML_OP_SUM_ROWS:
2164
- case GGML_OP_ARGSORT:
2165
- case GGML_OP_ACC:
2166
- case GGML_OP_GROUP_NORM:
2167
- case GGML_OP_PAD:
2168
- case GGML_OP_ARANGE:
2169
- case GGML_OP_TIMESTEP_EMBEDDING:
2170
- case GGML_OP_LEAKY_RELU:
2171
- case GGML_OP_ARGMAX:
2172
- case GGML_OP_COS:
2173
- case GGML_OP_SIN:
2174
- case GGML_OP_CONV_TRANSPOSE_1D:
2175
- case GGML_OP_LOG:
2176
- case GGML_OP_MEAN:
2177
- case GGML_OP_PAD_REFLECT_1D:
2178
- case GGML_OP_COUNT_EQUAL:
2179
- return true;
2180
- default:
2181
- return false;
2182
- }
2183
-
2184
- GGML_UNUSED(dev);
2185
- }
2186
-
2187
- /**
2188
- * @brief Checks if the backend buffer type is associated with the CANN backend.
2189
- *
2190
- * This function checks whether the provided backend buffer type is associated
2191
- * with the CANN backend based on the comparison of its name retrieval function
2192
- * pointer.
2193
- *
2194
- * @param buft Pointer to the backend buffer type to check.
2195
- * @return bool Returns true if the buffer type is associated with the CANN
2196
- * backend, otherwise false.
2197
- */
2198
- static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
2199
- return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
2200
- }
2201
-
2202
- /**
2203
- * @brief Determines if a tensor operation should be offloaded to the CANN
2204
- * backend.
2205
- *
2206
- * This function checks if a given tensor operation should be offloaded to the
2207
- * CANN backend based on the operation type and the size of the tensor. It
2208
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
2209
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
2210
- *
2211
- * @param backend Pointer to the CANN backend.
2212
- * @param op Pointer to the tensor operation to check.
2213
- * @return bool Returns true if the operation should be offloaded, otherwise
2214
- * false.
2215
- */
2216
- static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
2217
- const ggml_tensor* op) {
2218
- const int min_batch_size = 32;
2219
- GGML_UNUSED(dev);
2220
-
2221
- return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
2222
- }
2223
-
2224
- /**
2225
- * @brief Records an event on the CANN backend stream.
2226
- *
2227
- * This function records the given event on the ACL runtime stream associated
2228
- * with the backend context.
2229
- *
2230
- * @param event Pointer to the event structure to be recorded.
2231
- */
2232
- static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
2233
- ggml_backend_cann_context* cann_ctx =
2234
- (ggml_backend_cann_context*)backend->context;
2235
- ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
2236
- }
2237
-
2238
- /**
2239
- * @brief Waits for a recorded event to complete on the CANN backend stream.
2240
- *
2241
- * This function makes the given backend wait for the event to complete on its
2242
- * ACL runtime stream.
2243
- *
2244
- * @param backend Pointer to the backend structure.
2245
- * @param event Pointer to the event structure that the backend needs to wait
2246
- * for.
2247
- */
2248
- static void ggml_backend_cann_event_wait(ggml_backend_t backend,
2249
- ggml_backend_event_t event) {
2250
- ggml_backend_cann_context* cann_ctx =
2251
- (ggml_backend_cann_context*)backend->context;
2252
- if (ggml_backend_is_cann(backend)) {
2253
- ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
2254
- (aclrtEvent)event->context));
2255
- } else {
2256
- GGML_ABORT("fatal error");
2257
- }
2258
- }
2259
-
2260
- /**
2261
- * @brief Structure defining the interface for the CANN backend.
2262
- *
2263
- * This structure contains function pointers for various operations
2264
- * supported by the CANN backend, including name retrieval, memory
2265
- * management, tensor operations, synchronization, and event handling.
2266
- */
2267
- static const ggml_backend_i ggml_backend_cann_interface = {
2268
- /* .get_name = */ ggml_backend_cann_name,
2269
- /* .free = */ ggml_backend_cann_free,
2270
- /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
2271
- /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
2272
- /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
2273
- /* .synchronize = */ ggml_backend_cann_synchronize,
2274
- /* .graph_plan_create = */ NULL,
2275
- /* .graph_plan_free = */ NULL,
2276
- /* .graph_plan_update = */ NULL,
2277
- /* .graph_plan_compute = */ NULL,
2278
- /* .graph_compute = */ ggml_backend_cann_graph_compute,
2279
- /* .event_record = */ ggml_backend_cann_event_record,
2280
- /* .event_wait = */ ggml_backend_cann_event_wait,
2281
- };
2282
-
2283
- /**
2284
- * @brief Return the hardcoded GUID for the CANN backend.
2285
- *
2286
- * This function returns a static GUID which uniquely identifies the CANN
2287
- * backend.
2288
- *
2289
- * @return A pointer to the static GUID.
2290
- */
2291
- static ggml_guid_t ggml_backend_cann_guid() {
2292
- static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
2293
- 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64};
2294
- return &guid;
2295
- }
2296
-
2297
- // backend device
2298
- struct ggml_backend_cann_device_context {
2299
- int device;
2300
- std::string name;
2301
- std::string description;
2302
- };
2303
-
2304
- static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
2305
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2306
- return ctx->name.c_str();
2307
- }
2308
-
2309
- static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
2310
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2311
- return ctx->description.c_str();
2312
- }
2313
-
2314
- static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2315
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2316
- ggml_backend_cann_get_device_memory(ctx->device, free, total);
2317
- }
2318
-
2319
- static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
2320
- GGML_UNUSED(dev);
2321
- return GGML_BACKEND_DEVICE_TYPE_GPU;
2322
- }
2323
-
2324
- static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
2325
- props->name = ggml_backend_cann_device_get_name(dev);
2326
- props->description = ggml_backend_cann_device_get_description(dev);
2327
- props->type = ggml_backend_cann_device_get_type(dev);
2328
- ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total);
2329
-
2330
- bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr;
2331
-
2332
- props->caps = {
2333
- /* .async = */ false,
2334
- /* .host_buffer = */ host_buffer,
2335
- /* .buffer_from_host_ptr = */ false,
2336
- /* .events = */ true,
2337
- };
2338
- }
2339
-
2340
- static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
2341
- GGML_UNUSED(params);
2342
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2343
- return ggml_backend_cann_init(ctx->device);
2344
- }
2345
-
2346
- /**
2347
- * @brief Checks if the CANN backend supports a specific backend buffer type.
2348
- *
2349
- * This function determines whether the CANN backend supports the given backend
2350
- * buffer type by comparing the device context of the backend and buffer type.
2351
- * It returns true if the devices are same between the backend context and
2352
- * buffer type context.
2353
- *
2354
- * @param backend Pointer to the CANN backend.
2355
- * @param buft Pointer to the backend buffer type to check.
2356
- * @return bool Returns true if the CANN backend supports the buffer type,
2357
- * otherwise false.
2358
- */
2359
- static bool ggml_backend_cann_supports_buft(
2360
- ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2361
- if (ggml_backend_buft_is_cann(buft)) {
2362
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2363
- ggml_backend_cann_buffer_type_context * buft_ctx =
2364
- (ggml_backend_cann_buffer_type_context *)buft->context;
2365
- return buft_ctx->device == dev_ctx->device;
2366
- }
2367
- return false;
2368
- }
2369
-
2370
- static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
2371
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2372
- return ggml_backend_cann_buffer_type(ctx->device);
2373
- }
2374
-
2375
- static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) {
2376
- GGML_UNUSED(dev);
2377
- return ggml_backend_cann_host_buffer_type();
2378
- }
2379
-
2380
- /**
2381
- * @brief Creates a new event for the CANN backend device.
2382
- *
2383
- * This function initializes a new event for the CANN backend by setting the
2384
- * device and creating an ACL runtime event. The created event is then wrapped
2385
- * in a ggml_backend_event structure and returned.
2386
- *
2387
- * @param backend Pointer to the CANN backend.
2388
- * @return ggml_backend_event_t Returns a pointer to the new event structure.
2389
- */
2390
- static ggml_backend_event_t ggml_backend_cann_device_event_new(
2391
- ggml_backend_dev_t dev) {
2392
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2393
-
2394
- ggml_cann_set_device(dev_ctx->device);
2395
-
2396
- aclrtEvent event;
2397
- ACL_CHECK(aclrtCreateEvent(&event));
2398
-
2399
- return new ggml_backend_event{
2400
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device),
2401
- /* .context = */ event,
2402
- };
2403
- }
2404
-
2405
- /**
2406
- * @brief Frees a CANN backend event.
2407
- *
2408
- * This function destroys the ACL runtime event associated with the given CANN
2409
- * backend event and then deletes the event structure itself.
2410
- *
2411
- * @param event Pointer to the event structure to be freed.
2412
- */
2413
- static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2414
- ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
2415
-
2416
- delete event;
2417
- GGML_UNUSED(dev);
2418
- }
2419
-
2420
- /**
2421
- * @brief Synchronizes the given event on the CANN backend.
2422
- *
2423
- * This function waits for the specified event to complete on the ACL runtime.
2424
- *
2425
- * @param event Pointer to the event structure to be synchronized.
2426
- */
2427
- static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2428
- ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
2429
-
2430
- GGML_UNUSED(dev);
2431
- }
2432
-
2433
- static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2434
- /* .get_name = */ ggml_backend_cann_device_get_name,
2435
- /* .get_description = */ ggml_backend_cann_device_get_description,
2436
- /* .get_memory = */ ggml_backend_cann_device_get_memory,
2437
- /* .get_type = */ ggml_backend_cann_device_get_type,
2438
- /* .get_props = */ ggml_backend_cann_device_get_props,
2439
- /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2440
- /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
2441
- /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
2442
- /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2443
- /* .supports_op = */ ggml_backend_cann_supports_op,
2444
- /* .supports_buft = */ ggml_backend_cann_supports_buft,
2445
- /* .offload_op = */ ggml_backend_cann_offload_op,
2446
- /* .event_new = */ ggml_backend_cann_device_event_new,
2447
- /* .event_free = */ ggml_backend_cann_device_event_free,
2448
- /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
2449
- };
2450
-
2451
-
2452
- // backend reg
2453
- struct ggml_backend_cann_reg_context {
2454
- std::vector<ggml_backend_dev_t> devices;
2455
- };
2456
-
2457
- static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
2458
- GGML_UNUSED(reg);
2459
- return GGML_CANN_NAME;
2460
- }
2461
-
2462
- static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
2463
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2464
- return ctx->devices.size();
2465
- }
2466
-
2467
- static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
2468
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2469
- GGML_ASSERT(index < ctx->devices.size());
2470
- return ctx->devices[index];
2471
- }
2472
-
2473
- static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
2474
- GGML_UNUSED(reg);
2475
- GGML_UNUSED(name);
2476
- // reserved for future use
2477
- return nullptr;
2478
- }
2479
-
2480
- static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2481
- /* .get_name = */ ggml_backend_cann_reg_get_name,
2482
- /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
2483
- /* .get_device = */ ggml_backend_cann_reg_get_device,
2484
- /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
2485
- };
2486
-
2487
- // backend registry, called only once for cann backend
2488
- ggml_backend_reg_t ggml_backend_cann_reg() {
2489
- static ggml_backend_reg reg;
2490
- static bool initialized = false;
2491
-
2492
- {
2493
- static std::mutex mutex;
2494
- std::lock_guard<std::mutex> lock(mutex);
2495
- if (!initialized) {
2496
- aclInit(nullptr);
2497
- ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
2498
-
2499
- for (int i = 0; i < ggml_cann_info().device_count; i++) {
2500
- ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
2501
- dev_ctx->description = aclrtGetSocName();
2502
- dev_ctx->device = i;
2503
- dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2504
- ggml_cann_set_device(i);
2505
- ggml_backend_dev_t dev = new ggml_backend_device {
2506
- /* .iface = */ ggml_backend_cann_device_interface,
2507
- /* .reg = */ &reg,
2508
- /* .context = */ dev_ctx
2509
- };
2510
- ctx->devices.push_back(dev);
2511
- }
2512
-
2513
- reg = ggml_backend_reg {
2514
- /* .api_version = */ GGML_BACKEND_API_VERSION,
2515
- /* .iface = */ ggml_backend_cann_reg_interface,
2516
- /* .context = */ ctx
2517
- };
2518
- }
2519
-
2520
- initialized = true;
2521
- }
2522
-
2523
- return &reg;
2524
- }
2525
-
2526
- ggml_backend_t ggml_backend_cann_init(int32_t device) {
2527
- aclInit(nullptr);
2528
- if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
2529
- GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
2530
- return nullptr;
2531
- }
2532
-
2533
- ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
2534
- if (ctx == nullptr) {
2535
- GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
2536
- return nullptr;
2537
- }
2538
- ggml_cann_set_device(ctx->device);
2539
- ggml_backend_t cann_backend =
2540
- new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
2541
- /* .interface = */ ggml_backend_cann_interface,
2542
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
2543
- /* .context = */ ctx};
2544
-
2545
- return cann_backend;
2546
- }
2547
-
2548
- bool ggml_backend_is_cann(ggml_backend_t backend) {
2549
- return backend != NULL &&
2550
- ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2551
- }
2552
-
2553
- int32_t ggml_backend_cann_get_device_count() {
2554
- return ggml_cann_info().device_count;
2555
- }
2556
-
2557
- void ggml_backend_cann_get_device_description(
2558
- int32_t device, char* description, size_t description_size) {
2559
- ggml_cann_set_device(device);
2560
- const char* soc_name = aclrtGetSocName();
2561
- snprintf(description, description_size, "%s", soc_name);
2562
- }
2563
-
2564
- void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2565
- size_t* total) {
2566
- ggml_cann_set_device(device);
2567
- ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
2568
- }
2569
-
2570
- GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)