@fugood/llama.node 0.6.3 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. package/CMakeLists.txt +40 -30
  2. package/README.md +4 -1
  3. package/lib/binding.js +41 -29
  4. package/lib/binding.ts +26 -25
  5. package/package.json +45 -7
  6. package/scripts/build.js +47 -0
  7. package/scripts/llama.cpp.patch +109 -0
  8. package/src/anyascii.c +22223 -0
  9. package/src/anyascii.h +42 -0
  10. package/src/tts_utils.cpp +20 -7
  11. package/src/tts_utils.h +2 -0
  12. package/bin/darwin/arm64/llama-node.node +0 -0
  13. package/bin/darwin/x64/llama-node.node +0 -0
  14. package/bin/linux/arm64/llama-node.node +0 -0
  15. package/bin/linux/x64/llama-node.node +0 -0
  16. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  17. package/bin/linux-cuda/x64/llama-node.node +0 -0
  18. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  19. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  20. package/bin/win32/x64/llama-node.node +0 -0
  21. package/bin/win32/x64/node.lib +0 -0
  22. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  23. package/bin/win32-vulkan/arm64/node.lib +0 -0
  24. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  25. package/bin/win32-vulkan/x64/node.lib +0 -0
  26. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +0 -233
  27. package/src/llama.cpp/.github/workflows/build.yml +0 -1078
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +0 -28
  29. package/src/llama.cpp/.github/workflows/docker.yml +0 -178
  30. package/src/llama.cpp/.github/workflows/editorconfig.yml +0 -29
  31. package/src/llama.cpp/.github/workflows/gguf-publish.yml +0 -44
  32. package/src/llama.cpp/.github/workflows/labeler.yml +0 -17
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +0 -33
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +0 -30
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +0 -40
  36. package/src/llama.cpp/.github/workflows/release.yml +0 -739
  37. package/src/llama.cpp/.github/workflows/server.yml +0 -237
  38. package/src/llama.cpp/.github/workflows/winget.yml +0 -42
  39. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +0 -16
  40. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +0 -16
  41. package/src/llama.cpp/cmake/build-info.cmake +0 -64
  42. package/src/llama.cpp/cmake/common.cmake +0 -35
  43. package/src/llama.cpp/cmake/git-vars.cmake +0 -22
  44. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -5
  45. package/src/llama.cpp/common/build-info.cpp.in +0 -4
  46. package/src/llama.cpp/docs/build.md +0 -561
  47. package/src/llama.cpp/examples/CMakeLists.txt +0 -43
  48. package/src/llama.cpp/examples/batched/CMakeLists.txt +0 -5
  49. package/src/llama.cpp/examples/batched/batched.cpp +0 -246
  50. package/src/llama.cpp/examples/chat-13B.bat +0 -57
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +0 -5
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +0 -941
  53. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +0 -35
  54. package/src/llama.cpp/examples/embedding/CMakeLists.txt +0 -5
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +0 -323
  56. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +0 -10
  57. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +0 -194
  58. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +0 -5
  59. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +0 -83
  60. package/src/llama.cpp/examples/gguf/CMakeLists.txt +0 -5
  61. package/src/llama.cpp/examples/gguf/gguf.cpp +0 -265
  62. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +0 -22
  63. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +0 -46
  64. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +0 -295
  65. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +0 -52
  66. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +0 -221
  67. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +0 -24
  68. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +0 -42
  69. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +0 -7093
  70. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +0 -694
  71. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +0 -5
  72. package/src/llama.cpp/examples/gritlm/gritlm.cpp +0 -229
  73. package/src/llama.cpp/examples/jeopardy/questions.txt +0 -100
  74. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +0 -65
  75. package/src/llama.cpp/examples/llama.android/build.gradle.kts +0 -6
  76. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +0 -71
  77. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +0 -53
  78. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +0 -452
  79. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +0 -18
  80. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +0 -5
  81. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -472
  82. package/src/llama.cpp/examples/lookup/CMakeLists.txt +0 -23
  83. package/src/llama.cpp/examples/lookup/lookup-create.cpp +0 -40
  84. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +0 -47
  85. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -157
  86. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -242
  87. package/src/llama.cpp/examples/parallel/CMakeLists.txt +0 -5
  88. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -492
  89. package/src/llama.cpp/examples/passkey/CMakeLists.txt +0 -5
  90. package/src/llama.cpp/examples/passkey/passkey.cpp +0 -277
  91. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +0 -5
  92. package/src/llama.cpp/examples/retrieval/retrieval.cpp +0 -304
  93. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -5
  94. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -246
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +0 -5
  96. package/src/llama.cpp/examples/simple/simple.cpp +0 -206
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +0 -5
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +0 -206
  99. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +0 -11
  100. package/src/llama.cpp/examples/speculative/CMakeLists.txt +0 -5
  101. package/src/llama.cpp/examples/speculative/speculative.cpp +0 -644
  102. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +0 -5
  103. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +0 -261
  104. package/src/llama.cpp/examples/sycl/CMakeLists.txt +0 -9
  105. package/src/llama.cpp/examples/sycl/build.sh +0 -23
  106. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +0 -13
  107. package/src/llama.cpp/examples/sycl/run-llama2.sh +0 -27
  108. package/src/llama.cpp/examples/sycl/run-llama3.sh +0 -28
  109. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +0 -33
  110. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +0 -9
  111. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +0 -9
  112. package/src/llama.cpp/examples/training/CMakeLists.txt +0 -5
  113. package/src/llama.cpp/examples/training/finetune.cpp +0 -96
  114. package/src/llama.cpp/ggml/cmake/GitVars.cmake +0 -22
  115. package/src/llama.cpp/ggml/cmake/common.cmake +0 -26
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1042
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -255
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -586
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +0 -2008
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +0 -87
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +0 -517
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -74
  123. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +0 -179
  124. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +0 -258
  125. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +0 -2863
  126. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +0 -1110
  127. package/src/llama.cpp/ggml/src/ggml-cann/common.h +0 -420
  128. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -2570
  129. package/src/llama.cpp/ggml/src/ggml-common.h +0 -1857
  130. package/src/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +0 -100
  131. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +0 -184
  132. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +0 -15
  133. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +0 -243
  134. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +0 -140
  135. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -131
  136. package/src/llama.cpp/ggml/src/ggml-impl.h +0 -601
  137. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  138. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  139. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +0 -120
  140. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +0 -622
  141. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -113
  142. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +0 -96
  143. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -5124
  144. package/src/llama.cpp/ggml/src/ggml-opt.cpp +0 -1037
  145. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -5232
  146. package/src/llama.cpp/ggml/src/ggml-quants.h +0 -100
  147. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +0 -9
  148. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +0 -1813
  149. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +0 -189
  150. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +0 -37
  151. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +0 -239
  152. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +0 -39
  153. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -83
  154. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +0 -493
  155. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +0 -197
  156. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +0 -20
  157. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +0 -100
  158. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +0 -20
  159. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +0 -623
  160. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +0 -34
  161. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -701
  162. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +0 -11
  163. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +0 -791
  164. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +0 -1160
  165. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +0 -27
  166. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +0 -2957
  167. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -1536
  168. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +0 -75
  169. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +0 -99
  170. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +0 -311
  171. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +0 -20
  172. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -4443
  173. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +0 -105
  174. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +0 -8
  175. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +0 -136
  176. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +0 -21
  177. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -3030
  178. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +0 -33
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +0 -1108
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +0 -27
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +0 -474
  182. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +0 -26
  183. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +0 -46
  184. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +0 -10
  185. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +0 -74
  186. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +0 -83
  187. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +0 -362
  188. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +0 -20
  189. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +0 -264
  190. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +0 -20
  191. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +0 -13
  192. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +0 -23
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +0 -73
  194. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +0 -20
  195. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +0 -1215
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +0 -305
  197. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +0 -10
  198. package/src/llama.cpp/ggml/src/ggml-threading.cpp +0 -12
  199. package/src/llama.cpp/ggml/src/ggml-threading.h +0 -14
  200. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +0 -196
  201. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +0 -10699
  202. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -39
  203. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +0 -751
  204. package/src/llama.cpp/ggml/src/ggml.c +0 -6550
  205. package/src/llama.cpp/ggml/src/gguf.cpp +0 -1330
  206. package/src/llama.cpp/models/.editorconfig +0 -1
  207. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  208. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  209. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  210. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  211. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  212. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  213. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  214. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  215. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  216. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  217. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  219. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  220. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  221. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  222. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  223. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  225. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  227. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  228. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  230. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  231. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  232. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  233. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  236. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  237. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  239. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  240. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  241. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  242. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  245. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  248. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  249. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  256. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  257. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  259. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  260. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  261. package/src/llama.cpp/pocs/CMakeLists.txt +0 -14
  262. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +0 -9
  263. package/src/llama.cpp/pocs/vdot/q8dot.cpp +0 -173
  264. package/src/llama.cpp/pocs/vdot/vdot.cpp +0 -311
  265. package/src/llama.cpp/prompts/LLM-questions.txt +0 -49
  266. package/src/llama.cpp/prompts/alpaca.txt +0 -1
  267. package/src/llama.cpp/prompts/assistant.txt +0 -31
  268. package/src/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  269. package/src/llama.cpp/prompts/chat-with-bob.txt +0 -7
  270. package/src/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  271. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  272. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  273. package/src/llama.cpp/prompts/chat.txt +0 -28
  274. package/src/llama.cpp/prompts/dan-modified.txt +0 -1
  275. package/src/llama.cpp/prompts/dan.txt +0 -1
  276. package/src/llama.cpp/prompts/mnemonics.txt +0 -93
  277. package/src/llama.cpp/prompts/parallel-questions.txt +0 -43
  278. package/src/llama.cpp/prompts/reason-act.txt +0 -18
  279. package/src/llama.cpp/requirements/requirements-all.txt +0 -15
  280. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +0 -2
  281. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +0 -7
  282. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -7
  283. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +0 -5
  284. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +0 -1
  285. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +0 -4
  286. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +0 -3
  287. package/src/llama.cpp/requirements/requirements-pydantic.txt +0 -3
  288. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +0 -1
  289. package/src/llama.cpp/requirements/requirements-tool_bench.txt +0 -12
  290. package/src/llama.cpp/requirements.txt +0 -13
  291. package/src/llama.cpp/scripts/build-info.sh +0 -30
  292. package/src/llama.cpp/scripts/install-oneapi.bat +0 -19
  293. package/src/llama.cpp/scripts/xxd.cmake +0 -16
  294. package/src/llama.cpp/tests/CMakeLists.txt +0 -177
  295. package/src/llama.cpp/tests/get-model.cpp +0 -21
  296. package/src/llama.cpp/tests/get-model.h +0 -2
  297. package/src/llama.cpp/tests/test-arg-parser.cpp +0 -178
  298. package/src/llama.cpp/tests/test-autorelease.cpp +0 -24
  299. package/src/llama.cpp/tests/test-backend-ops.cpp +0 -4793
  300. package/src/llama.cpp/tests/test-barrier.cpp +0 -94
  301. package/src/llama.cpp/tests/test-c.c +0 -7
  302. package/src/llama.cpp/tests/test-chat-template.cpp +0 -417
  303. package/src/llama.cpp/tests/test-chat.cpp +0 -985
  304. package/src/llama.cpp/tests/test-double-float.cpp +0 -57
  305. package/src/llama.cpp/tests/test-gbnf-validator.cpp +0 -109
  306. package/src/llama.cpp/tests/test-gguf.cpp +0 -1338
  307. package/src/llama.cpp/tests/test-grammar-integration.cpp +0 -1308
  308. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +0 -1201
  309. package/src/llama.cpp/tests/test-grammar-parser.cpp +0 -519
  310. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +0 -1304
  311. package/src/llama.cpp/tests/test-llama-grammar.cpp +0 -408
  312. package/src/llama.cpp/tests/test-log.cpp +0 -39
  313. package/src/llama.cpp/tests/test-model-load-cancel.cpp +0 -27
  314. package/src/llama.cpp/tests/test-mtmd-c-api.c +0 -63
  315. package/src/llama.cpp/tests/test-opt.cpp +0 -904
  316. package/src/llama.cpp/tests/test-quantize-fns.cpp +0 -186
  317. package/src/llama.cpp/tests/test-quantize-perf.cpp +0 -365
  318. package/src/llama.cpp/tests/test-quantize-stats.cpp +0 -424
  319. package/src/llama.cpp/tests/test-regex-partial.cpp +0 -288
  320. package/src/llama.cpp/tests/test-rope.cpp +0 -262
  321. package/src/llama.cpp/tests/test-sampling.cpp +0 -399
  322. package/src/llama.cpp/tests/test-tokenizer-0.cpp +0 -312
  323. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +0 -155
  324. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +0 -125
  325. package/src/llama.cpp/tools/CMakeLists.txt +0 -39
  326. package/src/llama.cpp/tools/batched-bench/CMakeLists.txt +0 -5
  327. package/src/llama.cpp/tools/batched-bench/batched-bench.cpp +0 -204
  328. package/src/llama.cpp/tools/cvector-generator/CMakeLists.txt +0 -5
  329. package/src/llama.cpp/tools/cvector-generator/completions.txt +0 -582
  330. package/src/llama.cpp/tools/cvector-generator/cvector-generator.cpp +0 -508
  331. package/src/llama.cpp/tools/cvector-generator/mean.hpp +0 -48
  332. package/src/llama.cpp/tools/cvector-generator/negative.txt +0 -4
  333. package/src/llama.cpp/tools/cvector-generator/pca.hpp +0 -315
  334. package/src/llama.cpp/tools/cvector-generator/positive.txt +0 -4
  335. package/src/llama.cpp/tools/export-lora/CMakeLists.txt +0 -5
  336. package/src/llama.cpp/tools/export-lora/export-lora.cpp +0 -434
  337. package/src/llama.cpp/tools/gguf-split/CMakeLists.txt +0 -5
  338. package/src/llama.cpp/tools/gguf-split/gguf-split.cpp +0 -583
  339. package/src/llama.cpp/tools/imatrix/CMakeLists.txt +0 -5
  340. package/src/llama.cpp/tools/imatrix/imatrix.cpp +0 -667
  341. package/src/llama.cpp/tools/llama-bench/CMakeLists.txt +0 -5
  342. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +0 -2024
  343. package/src/llama.cpp/tools/main/CMakeLists.txt +0 -5
  344. package/src/llama.cpp/tools/main/main.cpp +0 -977
  345. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +0 -58
  346. package/src/llama.cpp/tools/mtmd/clip-impl.h +0 -462
  347. package/src/llama.cpp/tools/mtmd/clip.cpp +0 -4024
  348. package/src/llama.cpp/tools/mtmd/clip.h +0 -101
  349. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +0 -22
  350. package/src/llama.cpp/tools/mtmd/miniaudio.h +0 -93468
  351. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +0 -855
  352. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +0 -62
  353. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +0 -377
  354. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +0 -297
  355. package/src/llama.cpp/tools/mtmd/mtmd.cpp +0 -942
  356. package/src/llama.cpp/tools/mtmd/mtmd.h +0 -362
  357. package/src/llama.cpp/tools/mtmd/requirements.txt +0 -5
  358. package/src/llama.cpp/tools/perplexity/CMakeLists.txt +0 -5
  359. package/src/llama.cpp/tools/perplexity/perplexity.cpp +0 -2063
  360. package/src/llama.cpp/tools/quantize/CMakeLists.txt +0 -6
  361. package/src/llama.cpp/tools/quantize/quantize.cpp +0 -519
  362. package/src/llama.cpp/tools/rpc/CMakeLists.txt +0 -4
  363. package/src/llama.cpp/tools/rpc/rpc-server.cpp +0 -322
  364. package/src/llama.cpp/tools/run/CMakeLists.txt +0 -16
  365. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.cpp +0 -1995
  366. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.h +0 -137
  367. package/src/llama.cpp/tools/run/run.cpp +0 -1261
  368. package/src/llama.cpp/tools/server/CMakeLists.txt +0 -51
  369. package/src/llama.cpp/tools/server/bench/requirements.txt +0 -2
  370. package/src/llama.cpp/tools/server/httplib.h +0 -10506
  371. package/src/llama.cpp/tools/server/server.cpp +0 -4966
  372. package/src/llama.cpp/tools/server/tests/requirements.txt +0 -8
  373. package/src/llama.cpp/tools/server/utils.hpp +0 -1337
  374. package/src/llama.cpp/tools/tokenize/CMakeLists.txt +0 -5
  375. package/src/llama.cpp/tools/tokenize/tokenize.cpp +0 -416
  376. package/src/llama.cpp/tools/tts/CMakeLists.txt +0 -5
  377. package/src/llama.cpp/tools/tts/tts.cpp +0 -1092
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -1,2863 +0,0 @@
1
- /*
2
- * Copyright (c) 2023-2024 The ggml authors
3
- *
4
- * Permission is hereby granted, free of charge, to any person obtaining a copy
5
- * of this software and associated documentation files (the "Software"), to
6
- * deal in the Software without restriction, including without limitation the
7
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8
- * sell copies of the Software, and to permit persons to whom the Software is
9
- * furnished to do so, subject to the following conditions:
10
- *
11
- * The above copyright notice and this permission notice shall be included in
12
- * all copies or substantial portions of the Software.
13
- *
14
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20
- * IN THE SOFTWARE.
21
- */
22
-
23
- #include "aclnn_ops.h"
24
-
25
- #include <aclnnop/aclnn_addcdiv.h>
26
- #include <aclnnop/aclnn_avgpool2d.h>
27
- #include <aclnnop/aclnn_batch_matmul.h>
28
- #include <aclnnop/aclnn_cast.h>
29
- #include <aclnnop/aclnn_constant_pad_nd.h>
30
- #include <aclnnop/aclnn_copy.h>
31
- #include <aclnnop/aclnn_div.h>
32
- #include <aclnnop/aclnn_embedding.h>
33
- #include <aclnnop/aclnn_exp.h>
34
- #include <aclnnop/aclnn_fill_scalar.h>
35
- #include <aclnnop/aclnn_group_norm.h>
36
- #include <aclnnop/aclnn_index_fill_tensor.h>
37
- #include <aclnnop/aclnn_layer_norm.h>
38
- #include <aclnnop/aclnn_matmul.h>
39
- #include <aclnnop/aclnn_max_pool.h>
40
- #include <aclnnop/aclnn_mm.h>
41
- #include <aclnnop/aclnn_permute.h>
42
- #include <aclnnop/aclnn_pow_tensor_tensor.h>
43
- #include <aclnnop/aclnn_reduce_sum.h>
44
- #include <aclnnop/aclnn_repeat.h>
45
- #include <aclnnop/aclnn_repeat_interleave.h>
46
- #include <aclnnop/aclnn_roll.h>
47
- #include <aclnnop/aclnn_softmax.h>
48
- #include <aclnnop/aclnn_tril.h>
49
- #include <aclnnop/aclnn_triu.h>
50
- #include <aclnnop/aclnn_upsample_nearest_2d.h>
51
- #include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
52
- #include <aclnnop/aclnn_argmax.h>
53
- #include <aclnnop/aclnn_sum.h>
54
- #include <aclnnop/aclnn_rms_norm.h>
55
- #include <aclnnop/aclnn_im2col.h>
56
- #include <aclnnop/aclnn_add.h>
57
- #include <aclnnop/aclnn_sub.h>
58
- #include <aclnnop/aclnn_mul.h>
59
- #include <aclnnop/aclnn_div.h>
60
- #include <aclnnop/aclnn_convolution.h>
61
- #include <aclnnop/aclnn_elu.h>
62
- #include <aclnnop/aclnn_log.h>
63
- #include <aclnnop/aclnn_mean.h>
64
- #include <aclnnop/aclnn_reflection_pad1d.h>
65
- #include <aclnnop/aclnn_eq_tensor.h>
66
- #include <aclnnop/aclnn_gt_scalar.h>
67
- #include <aclnnop/aclnn_pow.h>
68
- #include <aclnnop/aclnn_grouped_matmul_v2.h>
69
- #include <float.h>
70
-
71
- #include <cmath>
72
- #include <cstring>
73
- #include <exception>
74
- #include <vector>
75
-
76
- #include "ggml-impl.h"
77
-
78
- #define GGML_COMMON_DECL_C
79
-
80
- #include "../ggml-common.h"
81
-
82
- void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
83
- aclTensor ** acl_src1, aclTensor ** acl_dst) {
84
- GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
85
- // Need bcast
86
- if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
87
- BCAST_SHAPE(src0, src1)
88
- *acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
89
- *acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
90
- *acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
91
- } else {
92
- *acl_src0 = ggml_cann_create_tensor(src0);
93
- *acl_src1 = ggml_cann_create_tensor(src1);
94
- *acl_dst = ggml_cann_create_tensor(dst);
95
- }
96
- }
97
-
98
- void ggml_cann_unary_op(
99
- std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
100
- ggml_backend_cann_context& ctx, ggml_tensor* dst) {
101
- ggml_tensor* src = dst->src[0];
102
-
103
- aclTensor* acl_src = ggml_cann_create_tensor(src);
104
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
105
-
106
- unary_op(ctx, acl_src, acl_dst);
107
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
108
- }
109
-
110
- /**
111
- * @brief Repeats elements of a tensor along each dimension according to the
112
- * specified repeat array.
113
- *
114
- * @param ctx The context for the CANN backend operations.
115
- * @param acl_src The source tensor to be repeated.
116
- * @param acl_dst The destination tensor after repeating.
117
- * @param repeat_array The array specifying the number of repetitions along each
118
- * dimension.
119
- */
120
- static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
121
- aclTensor* acl_dst, int64_t* repeat_array) {
122
- // repeat tensor along each dim with repeat_array
123
- aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
124
-
125
- GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats, acl_dst);
126
- ggml_cann_release_resources(ctx, repeats);
127
- }
128
-
129
- /**
130
- * @brief Casts the data type of a source tensor to a destination tensor.
131
- *
132
- * This function casts the data type of the source tensor `acl_src` to the
133
- * specified data type `cast_data_type` and stores the result in the destination
134
- * tensor `acl_dst`.
135
- *
136
- * @param ctx The context for the CANN backend operations.
137
- * @param acl_src The source tensor whose data type will be casted.
138
- * @param acl_dst The destination tensor where the casted result will be stored.
139
- * @param cast_data_type The target data type to which the source tensor will be
140
- * casted.
141
- */
142
- static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
143
- aclTensor* acl_dst, aclDataType cast_data_type) {
144
- GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src, cast_data_type, acl_dst);
145
- }
146
-
147
- void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
148
- ggml_tensor* src = dst->src[0];
149
- GGML_ASSERT(ggml_can_repeat(src, dst));
150
-
151
- aclTensor* acl_src = ggml_cann_create_tensor(src);
152
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
153
-
154
- int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
155
- dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
156
-
157
- aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
158
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
159
- }
160
-
161
- void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
162
- aclTensor* acl_src1, aclTensor* acl_dst) {
163
- float alphaValue = 1.0f;
164
- aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
165
- if (acl_dst != nullptr)
166
- GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
167
- else
168
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha);
169
- ggml_cann_release_resources(ctx, alpha);
170
- }
171
-
172
- void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
173
- aclTensor* acl_src1, aclTensor* acl_dst) {
174
- float alphaValue = 1.0f;
175
- aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
176
- if (acl_dst != nullptr)
177
- GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha, acl_dst);
178
- else
179
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha);
180
- ggml_cann_release_resources(ctx, alpha);
181
- }
182
-
183
- void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
184
- aclTensor* acl_other, aclTensor* acl_dst) {
185
- if (acl_dst != nullptr)
186
- GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_other, acl_dst);
187
- else
188
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_src, acl_other);
189
- }
190
-
191
- void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
192
- aclTensor* acl_other, aclTensor* acl_dst) {
193
- if (acl_dst != nullptr)
194
- GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_other, acl_dst);
195
- else
196
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDiv, acl_src, acl_other);
197
- }
198
-
199
- /**
200
- * @brief Multiplies elements of a tensor by a scalar value, optionally
201
- * in-place.
202
- *
203
- * This function multiplies each element of the source tensor `acl_src` by the
204
- * scalar `scale` and stores the result in the destination tensor `acl_dst`. If
205
- * `inplace` is true, `acl_dst` will not be used and the operation is performed
206
- * in-place on `acl_src`.
207
- * The operation is defined as:
208
- * \f[
209
- * \text {acl_dst }_i=\text {acl_src }_i \times \text {scale}
210
- * \f]
211
- *
212
- * @param ctx The context for the CANN backend operations.
213
- * @param acl_src The source tensor whose elements will be multiplied.
214
- * @param scale The scalar value by which each element of `acl_src` will be
215
- * multiplied.
216
- * @param acl_dst The destination tensor where the result will be stored if
217
- * `inplace` is false.
218
- * @param inplace Flag indicating whether to perform the operation in-place on
219
- * `acl_src`.
220
- */
221
- static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
222
- float scale, aclTensor* acl_dst, bool inplace) {
223
- aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
224
- if (inplace) {
225
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale);
226
- } else {
227
- GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, acl_scale, acl_dst);
228
- }
229
- ggml_cann_release_resources(ctx, acl_scale);
230
- }
231
-
232
- void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
233
- ggml_tensor* src = dst->src[0];
234
-
235
- GGML_ASSERT(src->type == GGML_TYPE_F32);
236
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
237
-
238
- aclTensor* acl_src = ggml_cann_create_tensor(src);
239
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
240
-
241
- float negative_slope;
242
- memcpy(&negative_slope, dst->op_params, sizeof(float));
243
- aclScalar* acl_negative_slope =
244
- aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
245
-
246
- GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src, acl_negative_slope, acl_dst);
247
- ggml_cann_release_resources(ctx, acl_negative_slope, acl_src, acl_dst);
248
- }
249
-
250
- /**
251
- * @brief Concatenates a list of tensors along a specified dimension and stores
252
- * the result in a destination tensor.
253
- *
254
- * @param ctx The context for the CANN backend operations.
255
- * @param tensorList The list of tensors to be concatenated.
256
- * @param acl_dst The destination tensor where the concatenated result will be
257
- * stored.
258
- * @param concat_dim The dimension along which the tensors will be concatenated.
259
- */
260
- static void aclnn_concat(ggml_backend_cann_context& ctx,
261
- aclTensorList* tensorList, aclTensor* acl_dst,
262
- int64_t concat_dim) {
263
- GGML_CANN_CALL_ACLNN_OP(ctx, Cat, tensorList, concat_dim, acl_dst);
264
- }
265
-
266
- void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
267
- ggml_tensor* src0 = dst->src[0];
268
- ggml_tensor* src1 = dst->src[1];
269
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
270
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
271
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
272
-
273
- const int32_t dim = ggml_get_op_params_i32(dst, 0);
274
-
275
- GGML_ASSERT(dim >= 0 && dim < 4);
276
- int32_t acl_dim = 3 - dim;
277
-
278
- aclTensor* tensors[] = {acl_src0, acl_src1};
279
- aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
280
- aclnn_concat(ctx, tensor_list, acl_dst, acl_dim);
281
-
282
- ggml_cann_release_resources(ctx, tensor_list, acl_dst);
283
- }
284
-
285
- /**
286
- * @brief Creates a tensor with values starting from `start`, incremented by
287
- * `step`, and ending before `stop`.
288
- *
289
- * This function performs the operation:
290
- * \f[
291
- * \text {out }_{i+1}=\text {out }_i+\text {step}
292
- * \f]
293
- * the range is [start, stop).
294
- *
295
- * @param ctx The context for the CANN backend operations.
296
- * @param acl_dst The destination tensor where the values will be stored.
297
- * @param start The starting value of the range.
298
- * @param stop The ending value of the range (exclusive).
299
- * @param step The step size between consecutive values.
300
- * @param n_elements The number of elements in the destination tensor.
301
- */
302
- static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
303
- float start, float stop, float step,
304
- int64_t n_elements) {
305
- int64_t steps = (int64_t)std::ceil((stop - start) / step);
306
- GGML_ASSERT(n_elements == steps);
307
-
308
- aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
309
- aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
310
- aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
311
-
312
- GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start, acl_end, acl_step, acl_dst);
313
- ggml_cann_release_resources(ctx, acl_start, acl_end, acl_step);
314
- }
315
-
316
- void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
317
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
318
-
319
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
320
-
321
- int64_t n_elements = ggml_nelements(dst);
322
- float start;
323
- float stop;
324
- float step;
325
- memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
326
- memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
327
- memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
328
-
329
- aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
330
- ggml_cann_release_resources(ctx, acl_dst);
331
- }
332
-
333
- void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
334
- ggml_tensor* src = dst->src[0];
335
-
336
- float min;
337
- float max;
338
- memcpy(&min, dst->op_params, sizeof(float));
339
- memcpy(&max, (float*)dst->op_params + 1, sizeof(float));
340
-
341
- aclTensor* acl_src = ggml_cann_create_tensor(src);
342
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
343
-
344
- aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
345
- aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
346
-
347
- GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src, acl_min, acl_max, acl_dst);
348
- ggml_cann_release_resources(ctx, acl_min, acl_max, acl_src, acl_dst);
349
- }
350
-
351
- void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
352
- ggml_tensor* src = dst->src[0];
353
-
354
- // scale factor
355
- float v;
356
- memcpy(&v, dst->op_params, sizeof(float));
357
-
358
- aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
359
- aclTensor* acl_src = ggml_cann_create_tensor(src);
360
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
361
-
362
- GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, scale, acl_dst);
363
- ggml_cann_release_resources(ctx, scale, acl_src, acl_dst);
364
- }
365
-
366
- void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
367
- ggml_tensor* src = dst->src[0];
368
- enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
369
-
370
- aclTensor* acl_src = ggml_cann_create_tensor(src);
371
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
372
- ggml_cann_pool_alloc temp_buffer_allocator(
373
- ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
374
- void* buffer = temp_buffer_allocator.get();
375
- aclTensor* tmp_tensor =
376
- ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
377
- dst->ne, dst->nb, GGML_MAX_DIMS);
378
- GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false),
379
- tmp_tensor);
380
- GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor, ggml_cann_type_mapping(dst->type), acl_dst);
381
- ggml_cann_release_resources(ctx, acl_src, tmp_tensor, acl_dst);
382
- }
383
-
384
- void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
385
- ggml_tensor* src = dst->src[0];
386
-
387
- aclTensor* acl_src = ggml_cann_create_tensor(src);
388
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
389
-
390
- float eps;
391
- memcpy(&eps, dst->op_params, sizeof(float));
392
-
393
- std::vector<int64_t> normData = {dst->ne[0]};
394
- aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
395
- GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src, norm, nullptr, nullptr,
396
- eps, acl_dst, nullptr, nullptr);
397
- ggml_cann_release_resources(ctx, norm, acl_src, acl_dst);
398
- }
399
-
400
- void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
401
- ggml_tensor* src = dst->src[0];
402
-
403
- aclTensor* acl_src = ggml_cann_create_tensor(src);
404
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
405
-
406
- int n_groups = dst->op_params[0];
407
-
408
- float eps;
409
- memcpy(&eps, dst->op_params + 1, sizeof(float));
410
-
411
- int64_t N = src->ne[3];
412
- int64_t C = src->ne[2];
413
- int64_t HxW = src->ne[1] * src->ne[0];
414
-
415
- size_t type_size = ggml_type_size(src->type);
416
- int64_t ne[] = {n_groups, N};
417
- size_t nb[] = {type_size, type_size * n_groups};
418
- size_t n_bytes = N * n_groups;
419
-
420
- ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
421
- void* buffer = temp_buffer_allocator.get();
422
- aclTensor* acl_mean_out = ggml_cann_create_tensor(
423
- buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
424
- aclTensor* acl_rstd_out = ggml_cann_create_tensor(
425
- (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
426
-
427
- GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps,
428
- acl_dst, acl_mean_out, acl_rstd_out);
429
- ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_mean_out, acl_rstd_out);
430
- }
431
-
432
- void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
433
- ggml_tensor* src0 = dst->src[0];
434
- ggml_tensor* src1 = dst->src[1];
435
-
436
- size_t nb1 = ((int32_t*)dst->op_params)[0];
437
- size_t nb2 = ((int32_t*)dst->op_params)[1];
438
- size_t nb3 = ((int32_t*)dst->op_params)[2];
439
- size_t offset = ((int32_t*)dst->op_params)[3];
440
- bool inplace = (bool)((int32_t*)dst->op_params)[4];
441
-
442
- size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3};
443
-
444
- aclTensor* acl_dst = ggml_cann_create_tensor(
445
- dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
446
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
447
-
448
- aclScalar* alpha = nullptr;
449
- float alphaValue = 1.0f;
450
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
451
-
452
- if (!inplace) {
453
- size_t cpy_size = ggml_nbytes(dst);
454
- ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
455
- ACL_MEMCPY_DEVICE_TO_DEVICE);
456
- aclTensor* acl_src0 = ggml_cann_create_tensor(
457
- src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
458
-
459
- GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
460
- ggml_cann_release_resources(ctx, acl_src0);
461
- } else {
462
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, acl_src1, alpha);
463
- }
464
- ggml_cann_release_resources(ctx, acl_src1, acl_dst);
465
- }
466
-
467
- /**
468
- * @brief Performs sum reduction on a given tensor along specified dimensions.
469
- *
470
- * This function reduces the input tensor by summing along the specified dimensions.
471
- *
472
- * @param ctx The context for the CANN backend operations.
473
- * @param dst The destination tensor where the reduced result will be stored.
474
- * @param dim An array of dimension indices.
475
- * @param dim_size The number of dimensions.
476
- */
477
- static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
478
- int64_t* dim, size_t dim_size) {
479
- GGML_ASSERT(dst->ne[0] == 1);
480
- ggml_tensor* src = dst->src[0];
481
- aclTensor* acl_src = ggml_cann_create_tensor(src);
482
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
483
- aclIntArray* reduce_dims = aclCreateIntArray(dim, dim_size);
484
-
485
- GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true,
486
- ggml_cann_type_mapping(dst->type), acl_dst);
487
- ggml_cann_release_resources(ctx, acl_src, acl_dst, reduce_dims);
488
- }
489
-
490
- void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
491
- int64_t reduce_dims[] = {3};
492
- aclnn_reduce_sum(ctx, dst, reduce_dims, 1);
493
- }
494
-
495
- void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
496
- int64_t reduce_dims[] = {0, 1, 2, 3};
497
- aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
498
- }
499
-
500
- void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
501
- ggml_tensor* dst) {
502
- ggml_tensor* src = dst->src[0];
503
- aclTensor* acl_src =
504
- ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
505
- aclTensor* acl_dst =
506
- ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
507
-
508
- std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
509
- auto output_size_array = aclCreateIntArray(output_size.data(), 2);
510
-
511
- GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src, output_size_array, acl_dst);
512
- ggml_cann_release_resources(ctx, acl_src, acl_dst, output_size_array);
513
- }
514
-
515
- /**
516
- * @brief Pads a tensor with a specified value along each dimension.
517
- *
518
- * This function performs padding of the source tensor `acl_src` and stores the
519
- * result in the destination tensor `acl_dst`. The padding values for each
520
- * dimension are specified in the `paddings` array.
521
- *
522
- * @param ctx The context for the CANN backend operations.
523
- * @param acl_src The source tensor to be padded.
524
- * @param acl_dst The destination tensor where the padded result will be stored.
525
- * @param paddings An array specifying the padding values for each dimension.
526
- * The size of the array should be twice the number of dimensions of the tensor.
527
- * @param value The value to be used for padding. The default value is 0.0.
528
- */
529
- static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
530
- aclTensor* acl_dst, int64_t* paddings,
531
- float value = 0.0f) {
532
- aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
533
- aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
534
-
535
- GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad, acl_value, acl_dst);
536
- ggml_cann_release_resources(ctx, acl_pad, acl_value);
537
- }
538
-
539
- void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
540
- ggml_tensor* src = dst->src[0];
541
- aclTensor* acl_src = ggml_cann_create_tensor(src);
542
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
543
-
544
- // padding: value in the array means how much distance will be padding.
545
- // the position of elements in the array means which dirction to padding,
546
- // each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind,
547
- // dim2.front, dim2.behind, dim3.front, dim3.behind]
548
- int64_t paddings[] = {
549
- 0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
550
- 0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
551
- aclnn_pad(ctx, acl_src, acl_dst, paddings);
552
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
553
- }
554
-
555
- /**
556
- * @brief Performs 2D average pooling on the input tensor and stores the result
557
- * in the destination tensor.
558
- *
559
- * This function performs average pooling on the source tensor and stores the
560
- * result in the destination tensor. The pooling parameters (kernel size,
561
- * strides, padding) are specified in the `op_params` of the destination tensor.
562
- *
563
- * @param ctx The context for the CANN backend operations.
564
- * @param dst The destination tensor where the result will be stored. The source
565
- * tensor is referenced by `dst->src[0]`.
566
- */
567
- static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
568
- ggml_tensor* dst) {
569
- ggml_tensor* src = dst->src[0];
570
- GGML_ASSERT(src->type == GGML_TYPE_F32);
571
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
572
-
573
- aclTensor* acl_src =
574
- ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
575
- aclTensor* acl_dst =
576
- ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
577
-
578
- const int32_t* opts = (const int32_t*)dst->op_params;
579
- const int k0 = opts[1];
580
- const int k1 = opts[2];
581
- const int s0 = opts[3];
582
- const int s1 = opts[4];
583
- const int p0 = opts[5];
584
- const int p1 = opts[6];
585
-
586
- std::vector<int64_t> kernel_dims = {k1, k0};
587
- std::vector<int64_t> stride_dims = {s1, s0};
588
- std::vector<int64_t> padding_avg_dims = {p1, p0}; // (padH, padW)
589
-
590
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
591
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
592
- auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2);
593
-
594
- bool ceil_mode = false;
595
- bool count_include_pad = true;
596
- int64_t divisor_override = 0;
597
- int8_t cube_math_type = 0;
598
- #ifdef ASCEND_310P
599
- cube_math_type = 1;
600
- #endif
601
-
602
- GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src, kernel_size, strides, paddings_avg,
603
- ceil_mode, count_include_pad, divisor_override,
604
- cube_math_type, acl_dst);
605
- ggml_cann_release_resources(ctx, acl_src, acl_dst, kernel_size, strides,
606
- paddings_avg);
607
- }
608
-
609
- /**
610
- * @brief Performs 2D max pooling on the input tensor and stores the result in
611
- * the destination tensor.
612
- *
613
- * This function performs max pooling on the source tensor and stores the result
614
- * in the destination tensor. The pooling parameters (kernel size, strides,
615
- * padding) are specified in the `op_params` of the destination tensor.
616
- *
617
- * @param ctx The context for the CANN backend operations.
618
- * @param dst The destination tensor where the result will be stored. The source
619
- * tensor is referenced by `dst->src[0]`.
620
- */
621
- static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
622
- ggml_tensor* dst) {
623
- ggml_tensor* src = dst->src[0];
624
- GGML_ASSERT(src->type == GGML_TYPE_F32);
625
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
626
-
627
- aclTensor* acl_src =
628
- ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
629
- aclTensor* acl_dst =
630
- ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
631
-
632
- const int32_t* opts = (const int32_t*)dst->op_params;
633
- const int k0 = opts[1];
634
- const int k1 = opts[2];
635
- const int s0 = opts[3];
636
- const int s1 = opts[4];
637
- const int p0 = opts[5];
638
- const int p1 = opts[6];
639
-
640
- int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2],
641
- src->ne[3]};
642
- size_t temp_nb[GGML_MAX_DIMS];
643
-
644
- temp_nb[0] = ggml_element_size(src);
645
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
646
- temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
647
- }
648
-
649
- ggml_cann_pool_alloc temp_buffer_allocator(
650
- ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
651
- void* buffer = temp_buffer_allocator.get();
652
- aclTensor* tmp_tensor = ggml_cann_create_tensor(
653
- buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
654
- GGML_MAX_DIMS, ACL_FORMAT_NCHW);
655
-
656
- // pad: see padding in ggml_cann_pad()
657
- int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0};
658
- float value = -FLT_MAX;
659
- aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value);
660
-
661
- // max_pool
662
- std::vector<int64_t> kernel_dims = {k1, k0};
663
- std::vector<int64_t> stride_dims = {s1, s0};
664
- // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
665
- std::vector<int64_t> padding_max_dims = {0, 0, 0, 0};
666
- std::vector<int64_t> dilation_size = {1, 1};
667
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
668
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
669
- auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4);
670
- auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
671
-
672
- bool ceil_mode = false;
673
- int64_t auto_pads = 0;
674
- GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor, kernel_size, strides, auto_pads,
675
- paddings_max, dilations, ceil_mode, acl_dst);
676
- ggml_cann_release_resources(ctx, acl_src, acl_dst, tmp_tensor, kernel_size,
677
- strides, paddings_max, dilations);
678
- }
679
-
680
- void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
681
- const int32_t* opts = (const int32_t*)dst->op_params;
682
- enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
683
- switch (op) {
684
- case GGML_OP_POOL_AVG:
685
- ggml_cann_avg_pool2d(ctx, dst);
686
- break;
687
- case GGML_OP_POOL_MAX:
688
- ggml_cann_max_pool2d(ctx, dst);
689
- break;
690
- case GGML_OP_POOL_COUNT:
691
- GGML_ABORT("fatal error");
692
- break;
693
- }
694
- }
695
-
696
- /**
697
- * @brief Copies data from the source tensor to the destination tensor.
698
- *
699
- * This function copies data from the source tensor `acl_src` to the destination
700
- * tensor `acl_dst`.
701
- *
702
- * @param ctx The context for the CANN backend operations.
703
- * @param acl_src The source tensor from which data will be copied.
704
- * @param acl_dst The destination tensor where the data will be copied to.
705
- */
706
- static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
707
- aclTensor* acl_dst) {
708
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst, acl_src);
709
- }
710
-
711
- void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
712
- ggml_tensor* src0 = dst->src[0];
713
-
714
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
715
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
716
- if (ggml_are_same_shape(src0, dst)) {
717
- if (dst->type == src0->type) {
718
- cann_copy(ctx, acl_src, acl_dst);
719
- } else {
720
- aclnn_cast(ctx, acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
721
- }
722
- } else {
723
- if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
724
- if (dst->type == src0->type) {
725
- size_t cpy_size = ggml_nbytes(dst);
726
- ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
727
- ACL_MEMCPY_DEVICE_TO_DEVICE);
728
- return;
729
- } else {
730
- ggml_cann_pool_alloc src_buffer_allocator(
731
- ctx.pool(),
732
- ggml_nelements(dst) * ggml_type_size(dst->type));
733
- void* src_trans_buffer = src_buffer_allocator.get();
734
- size_t src_trans_nb[GGML_MAX_DIMS];
735
- src_trans_nb[0] = ggml_type_size(dst->type);
736
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
737
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
738
- }
739
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
740
- src_trans_buffer, ggml_cann_type_mapping(dst->type),
741
- ggml_type_size(dst->type), src0->ne, src_trans_nb,
742
- GGML_MAX_DIMS);
743
-
744
- aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
745
- size_t cpy_size = ggml_nbytes(dst);
746
- ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
747
- ACL_MEMCPY_DEVICE_TO_DEVICE);
748
- ggml_cann_release_resources(ctx, src_trans_tensor);
749
- return;
750
- }
751
- } else if (ggml_is_contiguous(dst)) {
752
- ggml_cann_pool_alloc src_buffer_allocator(
753
- ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
754
- void* src_trans_buffer = src_buffer_allocator.get();
755
- size_t src_trans_nb[GGML_MAX_DIMS];
756
- src_trans_nb[0] = ggml_type_size(dst->type);
757
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
758
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
759
- }
760
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
761
- src_trans_buffer, ggml_cann_type_mapping(dst->type),
762
- ggml_type_size(dst->type), src0->ne, src_trans_nb,
763
- GGML_MAX_DIMS);
764
-
765
- aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
766
-
767
- size_t cpy_size = ggml_nbytes(dst);
768
- ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
769
- ACL_MEMCPY_DEVICE_TO_DEVICE);
770
- ggml_cann_release_resources(ctx, src_trans_tensor);
771
- return;
772
- } else {
773
-             GGML_ABORT("Unsupported: dst is not contiguous.");
774
- }
775
- }
776
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
777
- }
778
-
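ggml_cann_dup above chooses between a raw device-to-device copy (same type, contiguous) and an element-wise cast through a temporary buffer (different types). A minimal host-side sketch of that dispatch, with float/double standing in for the F32/F16 pair handled by the backend (illustrative only, not the ACL calls):

```cpp
#include <cstring>
#include <vector>

// Sketch of the dup/copy dispatch: when src and dst share an element type,
// a raw byte copy is enough; otherwise each element is converted.
static void dup_ref(const float* src, void* dst, size_t n, bool dst_is_double) {
    if (!dst_is_double) {
        // same type and contiguous: plain memory copy
        std::memcpy(dst, src, n * sizeof(float));
    } else {
        // different type: convert element-wise, mirroring the
        // cast-into-src_trans_buffer-then-copy path above
        double* out = static_cast<double*>(dst);
        for (size_t i = 0; i < n; i++) {
            out[i] = static_cast<double>(src[i]);
        }
    }
}
```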
779
- /**
780
- * @brief Creates an ACL tensor initialized with zeros using a provided buffer.
781
- *
782
- * This function initializes a tensor with zeros using the specified buffer and
783
- * tensor parameters.
784
- *
785
- * @param ctx The context for the CANN backend operations.
786
- * @param buffer The buffer to be used for the tensor data.
787
- * @param n_bytes The size of the buffer in bytes.
788
- * @param ne An array specifying the extents (sizes) of each dimension of the
789
- * tensor.
790
- * @param dims The number of dimensions of the tensor.
791
- * @param type The data type of the tensor.
792
- * @param type_size The size of each element in the tensor data type.
793
- * @return An ACL tensor initialized with zeros.
794
- */
795
- static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
796
- size_t n_bytes, int64_t* ne, int64_t dims,
797
- aclDataType type, size_t type_size) {
798
- size_t nb[GGML_MAX_DIMS];
799
- nb[0] = type_size;
800
- for (int i = 1; i < dims; i++) {
801
- nb[i] = nb[i - 1] * ne[i - 1];
802
- }
803
-
804
- ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
805
- aclTensor* zero =
806
- ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
807
- return zero;
808
- }
809
-
810
- /**
811
- * @brief Creates an ACL tensor initialized with a given value using a provided buffer.
812
- *
813
- * This function initializes a tensor with a given value using the specified buffer and
814
- * tensor parameters.
815
- *
816
- * @param ctx The context for the CANN backend operations.
817
- * @param buffer The buffer to be used for the tensor data.
818
- * @param n_bytes The size of the buffer in bytes.
819
- * @param ne An array specifying the extents (sizes) of each dimension of the
820
- * tensor.
821
- * @param dims The number of dimensions of the tensor.
822
- * @param type The data type of the tensor.
823
- * @param type_size The size of each element in the tensor data type.
824
- * @param value The value to be used for initializing the tensor (default
825
- * is 1.0).
826
- * @return An ACL tensor initialized with value.
827
- */
828
- static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
829
- size_t n_bytes, int64_t* ne, int64_t dims,
830
- aclDataType type, size_t type_size,
831
- float value = 1.0f) {
832
- aclTensor* acl_tensor =
833
- aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
834
- float alpha_host = 1.0f;
835
- aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
836
- aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
837
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor, other, alpha);
838
- return acl_tensor;
839
- }
840
-
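aclnn_values builds a constant tensor in two steps: an asynchronous zero fill of the buffer followed by an in-place scalar add. The same two-step fill written directly on a host buffer (a sketch, not the ACL calls):

```cpp
#include <cstring>
#include <cstddef>

// Fill a float buffer with `value` the way aclnn_values does it:
// memset to zero first, then add the scalar to every element.
static void fill_values_ref(float* buf, size_t n, float value) {
    std::memset(buf, 0, n * sizeof(float));   // aclnn_zero equivalent
    for (size_t i = 0; i < n; i++) {
        buf[i] += value;                      // InplaceAdds equivalent (alpha = 1)
    }
}
```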
841
- void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
842
- ggml_tensor* src = dst->src[0];
843
-
844
- aclTensor* acl_src = ggml_cann_create_tensor(src);
845
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
846
-
847
- float eps;
848
- memcpy(&eps, dst->op_params, sizeof(float));
849
- size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
850
- ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
851
-
852
- aclTensor* acl_gamma = aclnn_values(
853
- ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
854
- ggml_cann_type_mapping(src->type), ggml_element_size(src));
855
-
856
- size_t zero_tensor_n_bytes =
857
- src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src);
858
- ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes);
859
- aclTensor* acl_rstd =
860
- aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
861
- src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
862
- ggml_element_size(src));
863
- GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
864
- ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
865
- }
866
-
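Because the gamma tensor passed to RmsNorm above is filled with ones, the operator reduces to dividing each row by its root mean square. A scalar reference for one row of length n (independent of the CANN kernel; the function name is illustrative):

```cpp
#include <cmath>
#include <cstddef>

// RMS norm of a single row with weight gamma == 1:
// y[i] = x[i] / sqrt(mean(x^2) + eps)
static void rms_norm_row_ref(const float* x, float* y, size_t n, float eps) {
    double sum_sq = 0.0;
    for (size_t i = 0; i < n; i++) {
        sum_sq += (double)x[i] * (double)x[i];
    }
    const float inv_rms = 1.0f / std::sqrt((float)(sum_sq / n) + eps);
    for (size_t i = 0; i < n; i++) {
        y[i] = x[i] * inv_rms;
    }
}
```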
867
- // TODO: performance is low.
868
- void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
869
- float value) {
870
- ggml_tensor* src = dst->src[0];
871
-
872
- aclTensor* acl_src = ggml_cann_create_tensor(src);
873
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
874
-
875
- const int n_past = ((int32_t*)dst->op_params)[0];
876
-
877
- size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *
878
- src->ne[3] * ggml_element_size(src);
879
- ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
880
-
881
- aclTensor* mask_tensor =
882
- aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
883
- src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
884
- ggml_element_size(src), value);
885
-
886
- aclScalar* alpha = nullptr;
887
- float alphaValue = 1.0f;
888
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
889
-
890
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor, n_past + 1);
891
- GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src, n_past + 1, acl_dst);
892
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, mask_tensor, alpha);
893
- ggml_cann_release_resources(ctx, alpha, acl_src, acl_dst, mask_tensor);
894
- }
895
-
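ggml_cann_diag_mask reproduces GGML's diagonal masking by combining Triu, Tril and an add: entries to the right of column row + n_past end up equal to `value` (typically -INFINITY for the *_inf variant), the rest keep the source value. A direct scalar statement of that semantics (a sketch, not the ACL composition):

```cpp
#include <cstdint>

// Reference semantics of the diagonal mask applied above:
// elements with col > row + n_past are set to `value`, the rest pass through.
static void diag_mask_ref(const float* src, float* dst,
                          int64_t rows, int64_t cols,
                          int n_past, float value) {
    for (int64_t r = 0; r < rows; r++) {
        for (int64_t c = 0; c < cols; c++) {
            dst[r * cols + c] = (c > r + n_past) ? value : src[r * cols + c];
        }
    }
}
```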
896
- /**
897
- * @brief Permutes the dimensions of a tensor according to a specified order.
898
- *
899
- * This function permutes the dimensions of the source tensor `acl_src`
900
- * according to the order specified in the `new_dim` array and stores the result
901
- * in the destination tensor `acl_dst`.
902
- *
903
- * @param ctx The context for the CANN backend operations.
904
- * @param acl_src The source tensor whose dimensions will be permuted.
905
- * @param acl_dst The destination tensor where the permuted result will be
906
- * stored.
907
- * @param new_dim An array specifying the new order of dimensions for the
908
- * tensor.
909
- * @param dims The number of dimensions in the tensor.
910
- */
911
- static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
912
- aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
913
- aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
914
- GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims, acl_dst);
915
- ggml_cann_release_resources(ctx, acl_dims);
916
- }
917
-
918
- static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
919
- ggml_tensor* dst,
920
- ggml_tensor* src1,
921
- aclTensor* tmp_cast_tensor,
922
- aclTensor* tmp_im2col_tensor) {
923
- // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
924
- int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
925
- size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
926
- aclTensor* acl_dst =
927
- ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
928
-
929
- int64_t permute_dim[] = {0, 2, 1};
930
- if (src1->type != dst->type) {
931
- aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
932
- } else {
933
- aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
934
- }
935
-
936
- ggml_cann_release_resources(ctx, acl_dst);
937
- }
938
-
939
- static void ggml_cann_im2col_1d_post_process(
940
- ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
941
- aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
942
- const std::vector<int64_t>& im2col_op_params) {
943
- // get params
944
- const int64_t KH = im2col_op_params[0];
945
- const int64_t KW = im2col_op_params[1];
946
- const int64_t IW = im2col_op_params[2];
947
- const int64_t IC = im2col_op_params[3];
948
- const int64_t N = im2col_op_params[4];
949
- const int64_t OH = im2col_op_params[5];
950
- const int64_t OW = im2col_op_params[6];
951
- const int64_t s0 = im2col_op_params[7];
952
- const int64_t p0 = im2col_op_params[8];
953
- const int64_t d0 = im2col_op_params[9];
954
- const int64_t n_bytes_factor = im2col_op_params[10];
955
-
956
- // Permute: [N, IC * KH * KW, OW * OH] ->
957
- // [N, OW * OH * n_bytes_factor, IC * KH * KW]
958
- ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
959
- tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
960
- void* tmp_permute_buffer = tmp_permute_allocator.get();
961
-
962
- int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
963
- size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
964
- tmp_permute_nb[0] = ggml_type_size(dst->type);
965
- for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
966
- tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
967
- }
968
-
969
- aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
970
- tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
971
- ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
972
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
973
-
974
- int64_t permute_dim[] = {0, 2, 1};
975
- if (src1->type != dst->type) {
976
- aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
977
- } else {
978
- aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
979
- 3);
980
- }
981
-
982
- // number of times the kernel moves in W dimension
983
- const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
984
- size_t offset;
985
- void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
986
-
987
- // memory copy with offset to restore 1D im2col from 2d
988
- if (IC > 1) {
989
- offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
990
- size_t size_cpy = KH * KW * ggml_type_size(dst->type);
991
-
992
- for (int c = 0; c < IC; c++) {
993
- cur_permute_buffer = (char*)tmp_permute_buffer + offset +
994
- KH * KW * c * ggml_type_size(dst->type);
995
- cur_dst_buffer = (char*)dst->data +
996
- c * KH * KW * n_step_w * ggml_type_size(dst->type);
997
-
998
- for (int i = 0; i < n_step_w; i++) {
999
- ggml_cann_async_memcpy(ctx, cur_dst_buffer, cur_permute_buffer, size_cpy,
1000
- ACL_MEMCPY_DEVICE_TO_DEVICE);
1001
- cur_dst_buffer =
1002
- (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
1003
- cur_permute_buffer = (char*)cur_permute_buffer +
1004
- KH * KW * IC * ggml_type_size(dst->type);
1005
- }
1006
- }
1007
- } else {
1008
- offset = KH * KW * n_step_w *
1009
- ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
1010
- ggml_cann_async_memcpy(ctx, dst->data, (char*)tmp_permute_buffer + offset, offset,
1011
- ACL_MEMCPY_DEVICE_TO_DEVICE);
1012
- }
1013
-
1014
- ggml_cann_release_resources(ctx, tmp_permute_tensor);
1015
- }
1016
-
1017
- void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1018
- ggml_tensor* src0 = dst->src[0]; // kernel
1019
- ggml_tensor* src1 = dst->src[1]; // input
1020
-
1021
- GGML_TENSOR_BINARY_OP_LOCALS;
1022
-
1023
- // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
1024
- // im2col and do post-processing to restore it to 1D.
1025
- const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
1026
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
1027
- const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
1028
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
1029
- const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
1030
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
1031
- const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
1032
-
1033
- const int64_t N = ne13;
1034
- const int64_t IC = ne12;
1035
- const int64_t KH = ne01;
1036
- const int64_t KW = ne00;
1037
- const int64_t IW = ne10;
1038
-
1039
- const int64_t OH = is_2D ? ne2 : 1;
1040
- const int64_t OW = ne1;
1041
-
1042
- // memory allocated increased to 3x when is_2D == false
1043
- const int64_t n_bytes_factor = is_2D ? 1 : 3;
1044
-
1045
- // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
1046
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
1047
- int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
1048
- size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
1049
-
1050
- tmp_im2col_nb[0] = ggml_type_size(src1->type);
1051
- for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1052
- tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
1053
- }
1054
-
1055
- // Calculate im2col.
1056
-     // If dst is f16 and tmp_buffer is f32, we need to allocate src.typesize *
1057
-     // dst.elemcount bytes.
1058
- ggml_cann_pool_alloc im2col_allocator(
1059
- ctx.pool(),
1060
- ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
1061
- void* tmp_im2col_buffer = im2col_allocator.get();
1062
-
1063
- aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
1064
- tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
1065
- ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
1066
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1067
-
1068
- std::vector<int64_t> kernel_dims = {KH, KW};
1069
- std::vector<int64_t> dilation_size = {d1, d0};
1070
- std::vector<int64_t> padding_dims = {p1, p0};
1071
- std::vector<int64_t> stride_dims = {s1, s0};
1072
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
1073
- auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
1074
- auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
1075
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
1076
- GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1, kernel_size, dilations,
1077
- paddings, strides, tmp_im2col_tensor);
1078
-
1079
- // Cast if dst is f16.
1080
- aclTensor* tmp_cast_tensor = nullptr;
1081
- ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
1082
- void* tmp_cast_buffer = nullptr;
1083
- if (src1->type != dst->type) {
1084
- tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1085
- tmp_cast_buffer = tmp_cast_allocator.get();
1086
- size_t temp_cast_nb[GGML_MAX_DIMS - 1];
1087
- temp_cast_nb[0] = ggml_type_size(dst->type);
1088
- for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1089
- temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
1090
- }
1091
-
1092
- tmp_cast_tensor = ggml_cann_create_tensor(
1093
- tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
1094
- ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
1095
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1096
- aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, ggml_cann_type_mapping(dst->type));
1097
- }
1098
-
1099
- // post-processing
1100
- if (is_2D) {
1101
- ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
1102
- tmp_im2col_tensor);
1103
- } else {
1104
- std::vector<int64_t> im2col_op_params = {
1105
- KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
1106
- ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
1107
- tmp_im2col_tensor, im2col_op_params);
1108
- }
1109
-
1110
- ggml_cann_release_resources(ctx, acl_src1, tmp_im2col_tensor, tmp_cast_tensor,
1111
- kernel_size, dilations, paddings, strides);
1112
- }
1113
-
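The 1D im2col path relies on the usual convolution output-size formula for the number of kernel positions, n_step_w = (IW + 2*p0 - d0*(KW - 1) - 1) / s0 + 1, and then re-packs the 2D result with the offset copies shown above. A naive single-channel 1D im2col illustrating both (hypothetical helper names, not the backend code):

```cpp
#include <vector>
#include <cstdint>

// Number of positions the kernel takes along W.
static int64_t conv1d_out_size(int64_t IW, int64_t KW,
                               int64_t s0, int64_t p0, int64_t d0) {
    return (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
}

// Naive 1D im2col for a single channel: each output row holds the KW input
// samples seen by one kernel position (zero for out-of-range/padded samples).
static std::vector<float> im2col_1d_ref(const std::vector<float>& x,
                                        int64_t KW, int64_t s0,
                                        int64_t p0, int64_t d0) {
    const int64_t IW = (int64_t)x.size();
    const int64_t OW = conv1d_out_size(IW, KW, s0, p0, d0);
    std::vector<float> cols(OW * KW, 0.0f);
    for (int64_t ow = 0; ow < OW; ow++) {
        for (int64_t kw = 0; kw < KW; kw++) {
            const int64_t iw = ow * s0 + kw * d0 - p0;
            if (iw >= 0 && iw < IW) {
                cols[ow * KW + kw] = x[iw];
            }
        }
    }
    return cols;
}
```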
1114
- /**
1115
- * @brief Applies element-wise exponential function to the elements of a tensor.
1116
- *
1117
- * This function computes the exponential of each element in the source tensor
1118
- * `acl_src` and stores the result back into the same tensor.
1119
- * The operation is defined as:
1120
- * \f[
1121
- * \text {acl_src }_i=e^{acl\_src_i}
1122
- * \f]
1123
- *
1124
- * @param ctx The context for the CANN backend operations.
1125
- * @param acl_src The tensor on which the exponential function will be applied.
1126
- */
1127
- static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
1128
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceExp, acl_src);
1129
- }
1130
-
1131
- void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1132
- aclTensor* acl_dst) {
1133
- GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
1134
- }
1135
-
1136
- void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1137
- aclTensor* acl_dst) {
1138
- GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
1139
- }
1140
-
1141
- void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1142
- ggml_tensor* dst) {
1143
- const ggml_tensor* src = dst->src[0];
1144
-
1145
- GGML_ASSERT(src->type == GGML_TYPE_F32);
1146
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
1147
-
1148
- const int dim = dst->op_params[0];
1149
- const int max_period = dst->op_params[1];
1150
- int half = dim / 2;
1151
-
1152
- aclTensor* acl_src = ggml_cann_create_tensor(src);
1153
-
1154
- // arange: [0, ..., half)
1155
- float start = 0;
1156
- float stop = half;
1157
- float step = 1;
1158
- int64_t n_elements_arange = half;
1159
- int64_t tmp_arange_ne[] = {half};
1160
- size_t tmp_arange_nb[] = {sizeof(dst->type)};
1161
-
1162
- ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type));
1163
- void* tmp_arange_buffer = arange_allocator.get();
1164
- aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
1165
- tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
1166
- ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb,
1167
- GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1168
-
1169
- aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange);
1170
-
1171
- // freq
1172
- float freq_param = -logf(max_period) / half;
1173
- bool inplace = true;
1174
- aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace);
1175
- aclnn_exp(ctx, tmp_arange_tensor);
1176
-
1177
- // permute: src [0,1,2,3]->[0,1,3,2]
1178
- int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
1179
- size_t tmp_permute_nb[GGML_MAX_DIMS];
1180
- tmp_permute_nb[0] = ggml_type_size(src->type);
1181
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1182
- tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1183
- }
1184
-
1185
- ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
1186
- void* tmp_permute_buffer = permute_allocator.get();
1187
- aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
1188
- tmp_permute_buffer, ggml_cann_type_mapping(src->type),
1189
- ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
1190
- GGML_MAX_DIMS, ACL_FORMAT_ND);
1191
- int64_t permute_dim[] = {0, 1, 3, 2};
1192
- int64_t num_dims = 4;
1193
- aclnn_permute(ctx, acl_src, tmp_permute_tensor, permute_dim, num_dims);
1194
-
1195
- // timestep * freq
1196
- int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
1197
- src->ne[3]};
1198
- size_t tmp_mul_nb[GGML_MAX_DIMS];
1199
- tmp_mul_nb[0] = ggml_type_size(src->type);
1200
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1201
- tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
1202
- }
1203
-
1204
- int mul_nelements =
1205
- src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
1206
-
1207
- ggml_cann_pool_alloc mul_allocator(
1208
- ctx.pool(), mul_nelements * ggml_type_size(src->type));
1209
- void* tmp_mul_buffer = mul_allocator.get();
1210
- aclTensor* tmp_mul_tensor = ggml_cann_create_tensor(
1211
- tmp_mul_buffer, ggml_cann_type_mapping(src->type),
1212
- ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1213
- ACL_FORMAT_ND);
1214
- aclnn_mul(ctx, tmp_permute_tensor, tmp_arange_tensor, tmp_mul_tensor);
1215
-
1216
- // cos
1217
- ggml_cann_pool_alloc cos_allocator(
1218
- ctx.pool(), mul_nelements * ggml_type_size(src->type));
1219
- void* tmp_cos_buffer = cos_allocator.get();
1220
- aclTensor* tmp_cos_tensor = ggml_cann_create_tensor(
1221
- tmp_cos_buffer, ggml_cann_type_mapping(dst->type),
1222
- ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1223
- ACL_FORMAT_ND);
1224
-
1225
- aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor);
1226
-
1227
- // sin
1228
- ggml_cann_pool_alloc sin_allocator(
1229
- ctx.pool(), mul_nelements * ggml_type_size(src->type));
1230
- void* tmp_sin_buffer = sin_allocator.get();
1231
- aclTensor* tmp_sin_tensor = ggml_cann_create_tensor(
1232
- tmp_sin_buffer, ggml_cann_type_mapping(dst->type),
1233
- ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1234
- ACL_FORMAT_ND);
1235
-
1236
- aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor);
1237
-
1238
- // concat
1239
- int64_t concat_dim = 3;
1240
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1241
- aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
1242
- aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
1243
- aclnn_concat(ctx, tensor_list, acl_dst, concat_dim);
1244
-
1245
- // release
1246
-     // segmentation fault when deleting both the tensorList and its elements.
1247
- ggml_cann_release_resources(ctx, tensor_list, acl_src, tmp_arange_tensor,
1248
- tmp_permute_tensor, tmp_mul_tensor, acl_dst);
1249
- }
1250
-
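The timestep-embedding kernel builds its frequencies with arange → muls → exp, i.e. f_i = exp(-log(max_period) * i / half), multiplies them by each timestep, and concatenates the cos and sin halves. The same computation for a single timestep, as a reference sketch:

```cpp
#include <cmath>
#include <vector>

// Sinusoidal timestep embedding for one timestep t:
// out = [cos(t*f_0), ..., cos(t*f_{half-1}), sin(t*f_0), ..., sin(t*f_{half-1})]
// with f_i = exp(-log(max_period) * i / half).
static std::vector<float> timestep_embedding_ref(float t, int dim, int max_period) {
    const int half = dim / 2;
    std::vector<float> out(2 * half);
    for (int i = 0; i < half; i++) {
        const float freq = std::exp(-std::log((float)max_period) * (float)i / (float)half);
        out[i]        = std::cos(t * freq);
        out[half + i] = std::sin(t * freq);
    }
    return out;
}
```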
1251
- /**
1252
- * @brief Fills a tensor with a scalar value.
1253
- *
1254
- * This function fills the destination tensor `acl_dst` with the scalar value
1255
- * `scalar`.
1256
- *
1257
- * @param ctx The context for the CANN backend operations.
1258
- * @param scalar The scalar value used to fill the tensor.
1259
- * @param acl_dst The destination tensor to be filled with the scalar value.
1260
- */
1261
- static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
1262
- aclTensor* acl_dst) {
1263
- auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
1264
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
1265
- ggml_cann_release_resources(ctx, acl_scalar);
1266
- }
1267
-
1268
- /**
1269
- * @brief Raises each element of a tensor to the power of the corresponding
1270
- * element in another tensor.
1271
- *
1272
- * This function computes the element-wise power of the destination tensor
1273
- * `acl_dst` raised to the power of the exponent tensor `acl_exp`.
1274
- * The operation is defined as:
1275
- * \f[
1276
- * \text {acl_dst }_i=acl\_dst_i^{\text {acl_exp }_i}
1277
- * \f]
1278
- *
1279
- * @param ctx The context for the CANN backend operations.
1280
- * @param acl_dst The destination tensor, which also serves as the base tensor.
1281
- * @param acl_exp The exponent tensor, each element of which is used to raise
1282
- * the corresponding element in the destination tensor.
1283
- */
1284
- static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
1285
- aclTensor* acl_dst, aclTensor* acl_exp) {
1286
- GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp);
1287
- }
1288
-
1289
- /**
1290
- * @brief Applies the Alibi (Attention with Linear Biases) mechanism to the attention scores.
1291
- * @details This function implements the Alibi mechanism, which introduces
1292
- * learnable biases into the attention scores to simulate relative
1293
- * position encoding without the need for explicit positional
1294
- * embeddings.
1295
- *
1296
- * @param ctx The backend CANN context for executing operations.
1297
- * @param acl_src The source tensor representing the query or key.
1298
- * @param acl_position The position tensor containing relative positions.
1299
- * @param acl_dst The destination tensor where the result will be stored.
1300
- * @param n_head The number of attention heads.
1301
- * @param src_ne The dimensions of the source tensor.
1302
- * @param src_nb0 The byte size of the first dimension of the source
1303
- tensor.
1304
- * @param max_bias The maximum bias value used in the Alibi mechanism.
1305
- * @param dst The destination tensor object for additional metadata.
1306
- *
1307
- * The function performs the following steps:
1308
- * 1. Calculates the logarithm floor of the number of heads to determine the
1309
- base for bias calculation.
1310
- * 2. Initializes arrays with arithmetic sequences and fills them with bias
1311
- values.
1312
- * 3. Computes the bias tensor based on the calculated biases and arithmetic
1313
- sequences.
1314
- * 4. Reshapes the bias tensor to match the dimensions of the input tensors.
1315
- * 5. Multiplies the position tensor by the bias tensor.
1316
- * 6. Adds the result of the multiplication to the source tensor to produce the
1317
- final output.
1318
- */
1319
- static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1320
- aclTensor* acl_position, aclTensor* acl_dst,
1321
- const int n_head, int64_t* src_ne, const size_t src_nb0,
1322
- float max_bias, ggml_tensor* dst) {
1323
- const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
1324
- GGML_ASSERT(src_nb0 == sizeof(float));
1325
- GGML_ASSERT(n_head == src_ne[2]);
1326
-
1327
- const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
1328
-
1329
- float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
1330
- float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
1331
-
1332
- // init arange
1333
- ggml_cann_pool_alloc arange_allocator(ctx.pool(),
1334
- ne2_ne3 * ggml_type_size(dst->type));
1335
- void* tmp_arange_buffer = arange_allocator.get();
1336
-
1337
- // arange1: [1, ..., n_heads_log2_floor+1)
1338
- float start = 1;
1339
- float stop = n_heads_log2_floor + 1;
1340
- float step = 1;
1341
- int64_t n_elements_arange = n_heads_log2_floor;
1342
-
1343
- int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
1344
- size_t tmp_arange1_nb[] = {sizeof(dst->type)};
1345
- aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
1346
- tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
1347
- ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
1348
- GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1349
-
1350
- aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
1351
-
1352
- aclTensor* tmp_arange2_tensor = nullptr;
1353
- if (n_heads_log2_floor < ne2_ne3) {
1354
- // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
1355
- start = 1;
1356
- stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
1357
- step = 2;
1358
- n_elements_arange = ne2_ne3 - n_heads_log2_floor;
1359
- int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
1360
- size_t tmp_arange2_nb[] = {sizeof(dst->type)};
1361
-
1362
-         tmp_arange2_tensor = ggml_cann_create_tensor(
1363
- (char*)tmp_arange_buffer +
1364
- n_heads_log2_floor * ggml_type_size(dst->type),
1365
- ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1366
- tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1367
- aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
1368
- n_elements_arange);
1369
- }
1370
-
1371
- // init mk_base
1372
- ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
1373
- ne2_ne3 * ggml_type_size(dst->type));
1374
- void* tmp_mk_base_buffer = mk_base_allocator.get();
1375
- int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
1376
- size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
1377
- aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
1378
- tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
1379
- ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
1380
- GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1381
-
1382
- aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
1383
-
1384
- aclTensor* tmp_mk_base2_tensor = nullptr;
1385
- if (n_heads_log2_floor < ne2_ne3) {
1386
- int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
1387
- size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
1388
-         tmp_mk_base2_tensor = ggml_cann_create_tensor(
1389
- (char*)tmp_mk_base_buffer +
1390
- n_heads_log2_floor * ggml_type_size(dst->type),
1391
- ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1392
- tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1393
- aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
1394
- }
1395
-
1396
- // init mk
1397
- int64_t tmp_mk_base_ne[] = {ne2_ne3};
1398
- size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
1399
- aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
1400
- tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
1401
- ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
1402
- GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1403
- aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
1404
- tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
1405
- ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
1406
- GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1407
- aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
1408
-
1409
- // reshape mk
1410
- int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
1411
- size_t tmp_mk_nb[GGML_MAX_DIMS];
1412
- tmp_mk_nb[0] = ggml_type_size(dst->type);
1413
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1414
- tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
1415
- }
1416
- aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
1417
- tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
1418
- ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
1419
- ACL_FORMAT_ND);
1420
-
1421
- // acl_position * mk
1422
- int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
1423
- size_t tmp_output_nb[GGML_MAX_DIMS];
1424
- tmp_output_nb[0] = ggml_type_size(dst->type);
1425
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1426
- tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
1427
- }
1428
- ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
1429
- void* tmp_output_buffer = output_allocator.get();
1430
- aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
1431
- tmp_output_buffer, ggml_cann_type_mapping(dst->type),
1432
- ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
1433
- ACL_FORMAT_ND);
1434
- aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);
1435
-
1436
- // add
1437
- aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
1438
- ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
1439
- tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
1440
- tmp_arange_tensor, tmp_mk_tensor, tmp_output_tensor);
1441
- }
1442
-
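aclnn_alibi materialises the standard ALiBi slope schedule on device: the first 2^⌊log2(n_head)⌋ heads use m0 = 2^(-max_bias / n_heads_log2_floor) raised to 1, 2, 3, ..., and the remaining heads use m1 = 2^(-max_bias/2 / n_heads_log2_floor) raised to 1, 3, 5, .... A scalar version of the slope computation (the bias added to the scores is then slope × position); this is a sketch, not the device code:

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// ALiBi slope for each head, matching the m0/m1 + arange scheme used above.
static std::vector<float> alibi_slopes_ref(int n_head, float max_bias) {
    const uint32_t n_floor = 1u << (uint32_t)std::floor(std::log2((float)n_head));
    const float m0 = std::pow(2.0f, -max_bias / n_floor);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_floor);
    std::vector<float> slopes(n_head);
    for (int h = 0; h < n_head; h++) {
        if ((uint32_t)h < n_floor) {
            slopes[h] = std::pow(m0, (float)(h + 1));                      // exponents 1, 2, 3, ...
        } else {
            slopes[h] = std::pow(m1, (float)(2 * (h - (int)n_floor) + 1)); // exponents 1, 3, 5, ...
        }
    }
    return slopes;
}
```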
1443
- void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1444
- ggml_cann_dup(ctx, dst);
1445
- }
1446
-
1447
- /**
1448
- * @brief Applies the softmax function to a tensor along a specified dimension.
1449
- *
1450
- * This function computes the softmax of the source tensor `acl_src` along the
1451
- * specified dimension `dim` and stores the result in the destination tensor
1452
- * `acl_dst`.
1453
- *
1454
- * @param ctx The context for the CANN backend operations.
1455
- * @param acl_src The source tensor on which the softmax function will be
1456
- * applied.
1457
- * @param dim The dimension along which the softmax function will be computed.
1458
- * @param acl_dst The destination tensor where the softmax results will be
1459
- * stored.
1460
- */
1461
- static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1462
- int64_t dim, aclTensor* acl_dst) {
1463
- GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
1464
- }
1465
-
1466
- void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1467
- ggml_tensor* src0 = dst->src[0];
1468
- ggml_tensor* src1 = dst->src[1]; // mask
1469
-
1470
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
1471
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1472
-
1473
- float scale = 1.0f;
1474
- float max_bias = 0.0f;
1475
-
1476
- memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
1477
- memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
1478
-
1479
- // input mul scale
1480
- aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
1481
-
1482
- size_t n_bytes = ggml_nbytes(src0);
1483
- ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
1484
- void* input_mul_scale_buffer = mul_scale_allocator.get();
1485
- aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
1486
- input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
1487
- src0->nb, GGML_MAX_DIMS);
1488
-
1489
- bool inplace = false;
1490
- aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);
1491
-
1492
- // mask
1493
- aclTensor* acl_src1_fp32_tensor = nullptr;
1494
- aclTensor* tmp_mask_tensor = nullptr;
1495
- ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
1496
- if (src1) {
1497
- const bool use_f16 = src1->type == GGML_TYPE_F16;
1498
- if (use_f16) {
1499
- // cast to fp32
1500
- size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
1501
- size_t src1_fp32_nb[GGML_MAX_DIMS];
1502
- src1_fp32_nb[0] = sizeof(float_t);
1503
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1504
- src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
1505
- }
1506
- src1_fp32_allocator.alloc(n_bytes);
1507
- void* src1_fp32_buffer = src1_fp32_allocator.get();
1508
- acl_src1_fp32_tensor = ggml_cann_create_tensor(
1509
- src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
1510
- src1_fp32_nb, GGML_MAX_DIMS);
1511
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
1512
- aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
1513
- ggml_cann_release_resources(ctx, acl_src1);
1514
- } else {
1515
- acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
1516
- }
1517
-
1518
-         // broadcast the mask across rows; only ne11 of the ne01 rows are used from the mask
1519
- if (src1->ne[1] != src0->ne[1]) {
1520
- // mask shape: [1,1,ne11,ne10]
1521
- int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
1522
- size_t tmp_mask_nb[GGML_MAX_DIMS];
1523
- tmp_mask_nb[0] = sizeof(float_t);
1524
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1525
- tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
1526
- }
1527
- tmp_mask_tensor = ggml_cann_create_tensor(
1528
- src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
1529
- GGML_MAX_DIMS, ACL_FORMAT_ND);
1530
- }
1531
-
1532
- // alibi
1533
- const int n_head = src0->ne[2];
1534
- const size_t src_nb0 = src0->nb[0];
1535
-
1536
- n_bytes = ggml_nbytes(dst);
1537
- ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
1538
- void* output_buffer = output_allocator.get();
1539
- aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
1540
- output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
1541
- dst->nb, GGML_MAX_DIMS);
1542
- if (max_bias <= 0.0f) {
1543
- // slope = 1.0
1544
- if (tmp_mask_tensor) {
1545
- aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
1546
- alibi_output_tensor);
1547
- } else {
1548
- aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
1549
- alibi_output_tensor);
1550
- }
1551
- } else {
1552
- // slope != 1.0
1553
- if (tmp_mask_tensor) {
1554
- aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
1555
- alibi_output_tensor, n_head, src0->ne, src_nb0,
1556
- max_bias, dst);
1557
- } else {
1558
- aclnn_alibi(ctx, acl_input_mul_scale_tensor,
1559
- acl_src1_fp32_tensor, alibi_output_tensor, n_head,
1560
- src0->ne, src_nb0, max_bias, dst);
1561
- }
1562
- }
1563
-
1564
- // softmax
1565
- aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
1566
- ggml_cann_release_resources(ctx, alibi_output_tensor);
1567
- } else {
1568
- aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
1569
- }
1570
-
1571
- ggml_cann_release_resources(ctx, acl_src0, acl_src1_fp32_tensor, acl_dst,
1572
- acl_scale, acl_input_mul_scale_tensor, tmp_mask_tensor);
1573
- }
1574
-
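ggml_cann_softmax scales the logits, optionally adds the (possibly slope-weighted) mask, and applies a softmax along the last dimension. The per-row reference computation, without the ALiBi slope, looks like this (a sketch, not the fused CANN path):

```cpp
#include <cmath>
#include <cstddef>

// Row-wise softmax with scaling and an optional additive mask:
// y = softmax(x * scale + mask)
static void softmax_row_ref(const float* x, const float* mask, float* y,
                            size_t n, float scale) {
    float maxv = -INFINITY;
    for (size_t i = 0; i < n; i++) {
        const float v = x[i] * scale + (mask ? mask[i] : 0.0f);
        y[i] = v;
        if (v > maxv) maxv = v;
    }
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        y[i] = std::exp(y[i] - maxv);   // subtract the max for numerical stability
        sum += y[i];
    }
    for (size_t i = 0; i < n; i++) {
        y[i] /= sum;
    }
}
```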
1575
- /**
1576
- * @brief Performs embedding operation on a 4D tensor using the CANN backend.
1577
- *
1578
- * This function extracts slices from the source tensor (`src_buffer`),
1579
- * index tensor (`index`), and destination tensor (`dst`), and performs an
1580
- * embedding operation on them. The embedding operation is applied by iterating
1581
- * over the last two dimensions of the source tensor, creating the necessary
1582
- * tensors for the source, index, and output, and executing the embedding operation.
1583
- *
1584
- * @param ctx The context for CANN backend operations.
1585
- * @param src_buffer The source buffer holding the data for the source tensor.
1586
- * @param src_ne The dimensions of the source tensor.
1587
- * @param src_nb The strides (byte offsets) of the source tensor.
1588
- * @param index The index tensor used in the embedding operation.
1589
- * @param dst The destination tensor where the result will be stored.
1590
- */
1591
- static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
1592
- int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
1593
- ggml_tensor* dst) {
1594
- for (int64_t i = 0; i < src_ne[3]; i++) {
1595
- for (int64_t j = 0; j < src_ne[2]; j++) {
1596
- // src
1597
- int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
1598
- size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
1599
- aclTensor* acl_src_tensor = ggml_cann_create_tensor(
1600
- (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
1601
- ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
1602
- acl_src_ne, acl_src_nb, 2);
1603
-
1604
- // index
1605
- int64_t acl_index_ne[1] = {index->ne[0]};
1606
- size_t acl_index_nb[1] = {index->nb[0]};
1607
- aclTensor* acl_index = ggml_cann_create_tensor(
1608
- (char*)index->data + i * index->nb[2] + j * index->nb[1],
1609
- ggml_cann_type_mapping(index->type), ggml_element_size(index),
1610
- acl_index_ne, acl_index_nb, 1);
1611
-
1612
- // out
1613
- int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
1614
- size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
1615
- aclTensor* acl_out = ggml_cann_create_tensor(
1616
- (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
1617
- ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
1618
- acl_out_ne, acl_out_nb, 2);
1619
- GGML_CANN_CALL_ACLNN_OP(ctx, Embedding, acl_src_tensor, acl_index, acl_out);
1620
- ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
1621
- }
1622
- }
1623
- }
1624
-
1625
- void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1626
- ggml_tensor* src0 = dst->src[0]; // src
1627
- ggml_tensor* src1 = dst->src[1]; // index
1628
-
1629
- switch (src0->type) {
1630
- case GGML_TYPE_F32: {
1631
- aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
1632
- dst);
1633
- break;
1634
- }
1635
- case GGML_TYPE_F16: {
1636
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
1637
- ggml_cann_pool_alloc src_buffer_allocator(
1638
- ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
1639
- void* src_trans_buffer = src_buffer_allocator.get();
1640
- size_t src_trans_nb[GGML_MAX_DIMS];
1641
- src_trans_nb[0] = sizeof(float_t);
1642
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1643
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
1644
- }
1645
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
1646
- src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
1647
- src0->ne, src_trans_nb, GGML_MAX_DIMS);
1648
- aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
1649
- aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
1650
- src_trans_nb, src1, dst);
1651
- ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
1652
- break;
1653
- }
1654
- case GGML_TYPE_Q8_0: {
1655
- // add 1 dim for bcast mul.
1656
- size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
1657
- dequant_nb[GGML_MAX_DIMS + 1];
1658
- int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
1659
- *dequant_ne;
1660
- int64_t scale_offset = 0;
1661
-
1662
- // [3,4,5,64] -> [3,4,5,2,32]
1663
- weight_ne[0] = QK8_0;
1664
- weight_ne[1] = src0->ne[0] / QK8_0;
1665
- weight_nb[0] = sizeof(int8_t);
1666
- weight_nb[1] = weight_nb[0] * weight_ne[0];
1667
- for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1668
- weight_ne[i] = src0->ne[i - 1];
1669
- weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
1670
- }
1671
-
1672
- // [3,4,5,64] -> [3,4,5,2,1]
1673
- scale_ne[0] = 1;
1674
- scale_ne[1] = src0->ne[0] / QK8_0;
1675
- scale_nb[0] = sizeof(uint16_t);
1676
- scale_nb[1] = scale_nb[0] * scale_ne[0];
1677
- for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1678
- scale_ne[i] = src0->ne[i - 1];
1679
- scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
1680
- }
1681
-
1682
- // [3,4,5,64] -> [3,4,5,2,32]
1683
- dequant_ne = weight_ne;
1684
- dequant_nb[0] = sizeof(float_t);
1685
- for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
1686
- dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
1687
- }
1688
-
1689
- scale_offset = ggml_nelements(src0) * sizeof(int8_t);
1690
- ggml_cann_pool_alloc dequant_buffer_allocator(
1691
- ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
1692
-
1693
- aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
1694
- src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
1695
- GGML_MAX_DIMS + 1);
1696
- aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
1697
- src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
1698
- GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
1699
- aclTensor* dequant_tensor = ggml_cann_create_tensor(
1700
- dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
1701
- dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
1702
-
1703
- aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
1704
- dequant_nb[0] = sizeof(float_t);
1705
- dequant_ne = src0->ne;
1706
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1707
- dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
1708
- }
1709
-
1710
- aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
1711
- dequant_ne, dequant_nb, src1, dst);
1712
-
1713
- ggml_cann_release_resources(ctx, dequant_tensor);
1714
- break;
1715
- }
1716
- default:
1717
- GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
1718
- break;
1719
- }
1720
- }
1721
-
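The Q8_0 branch above dequantizes by viewing the data as blocks of QK8_0 int8 values and broadcast-multiplying each block by its fp16 scale; the scales are packed after the quantized data, at scale_offset = ggml_nelements(src0) * sizeof(int8_t). A scalar sketch of that dequantization, with a plain float scale array standing in for the packed fp16 scales:

```cpp
#include <cstdint>
#include <vector>

constexpr int QK8_0_REF = 32;  // block size, matching QK8_0

// Dequantize n int8 values grouped in blocks of QK8_0_REF.
// `qs` holds all quantized values back to back; `scales[b]` is the per-block
// scale (stored as fp16 after the int8 data in the real buffer).
static std::vector<float> dequant_q8_0_ref(const int8_t* qs,
                                           const float* scales,
                                           int64_t n) {
    std::vector<float> out(n);
    for (int64_t i = 0; i < n; i++) {
        const int64_t block = i / QK8_0_REF;
        out[i] = scales[block] * (float)qs[i];
    }
    return out;
}
```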
1722
- /**
1723
- * @brief Repeats elements of a tensor along a specified dimension.
1724
- *
1725
- * This function repeats each element of the source tensor `acl_src` a specified
1726
- * number of times (`repeats`) along the specified dimension `dim` and stores
1727
- * the result in the destination tensor `acl_dst`.
1728
- *
1729
- * @param ctx The context for the CANN backend operations.
1730
- * @param acl_src The source tensor whose elements will be repeated.
1731
- * @param acl_dst The destination tensor where the repeated elements will be
1732
- * stored.
1733
- * @param dim The dimension along which the elements will be repeated.
1734
- * @param repeats The number of times each element will be repeated.
1735
- * @param output_size The size of the output tensor.
1736
- */
1737
- static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
1738
- aclTensor* acl_src, aclTensor* acl_dst,
1739
- int64_t dim, int64_t repeats,
1740
- int64_t output_size) {
1741
- GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim,
1742
- output_size, acl_dst);
1743
- }
1744
-
1745
- /**
1746
- * @brief Performs matrix multiplication with floating-point precision on
1747
- * tensors using the CANN backend.
1748
- *
1749
- * This function performs matrix multiplication of the input tensor and the
1750
- * weight tensor, handling broadcasting and transposing as needed, and stores
1751
- * the result in the destination tensor `dst`.
1752
- *
1753
- * @param ctx The context for the CANN backend operations.
1754
- * @param dst The destination tensor where the result of the matrix
1755
- * multiplication will be stored.
1756
- */
1757
- static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
1758
- ggml_tensor* dst) {
1759
- ggml_tensor* weight = dst->src[0]; // weight
1760
- ggml_tensor* input = dst->src[1]; // input
1761
-
1762
-     // When weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will broadcast
1763
-     // automatically; when ne2 or ne3 is not 1, the weight needs to be repeated.
1764
- BCAST_MUL_MAT_SHAPE(input, weight, dst);
1765
-
1766
- int64_t n_dims = bcast_dims;
1767
- if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
1768
- if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
1769
- n_dims = 2;
1770
- } else if (bcast_input_ne[2] == 1) {
1771
- n_dims = 3;
1772
- }
1773
- }
1774
-
1775
- aclTensor* acl_input_tensor =
1776
- ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
1777
- int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
1778
- bcast_weight_ne[2], bcast_weight_ne[3],
1779
- bcast_weight_ne[4], bcast_weight_ne[5]};
1780
- size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
1781
- bcast_weight_nb[2], bcast_weight_nb[3],
1782
- bcast_weight_nb[4], bcast_weight_nb[5]};
1783
- aclTensor* acl_weight_tensor =
1784
- ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
1785
- aclTensor* acl_dst =
1786
- ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
1787
-
1788
- switch (n_dims) {
1789
- case 2:
1790
- GGML_CANN_CALL_ACLNN_OP(ctx, Mm, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
1791
- break;
1792
- case 3:
1793
- GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
1794
- break;
1795
- default:
1796
- // ALLOW_FP32_DOWN_PRECISION, when input is
1797
- // fp32, atlas a2 will transpose it to HFLOAT32.
1798
- GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_input_tensor, acl_weight_tensor, acl_dst, 1);
1799
- break;
1800
- }
1801
-
1802
- ggml_cann_release_resources(ctx, acl_weight_tensor, acl_input_tensor, acl_dst);
1803
- }
1804
-
1805
- /**
1806
- * @brief Performs matrix multiplication with quantized weights and
1807
- * floating-point inputs using the CANN backend.
1808
- *
1809
- * This function performs matrix multiplication of the input tensor `src1` and
1810
- * the weight tensor `src0`, handling broadcasting, transposing, and
1811
- * quantization as needed, and stores the result in the destination tensor
1812
- * `dst`.
1813
- *
1814
- * @param ctx The context for the CANN backend operations.
1815
- * @param dst The destination tensor where the result of the matrix
1816
- * multiplication will be stored.
1817
- */
1818
- static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
1819
- ggml_tensor* dst,
1820
- const enum ggml_type type) {
1821
- ggml_tensor* src0 = dst->src[0]; // weight
1822
- ggml_tensor* src1 = dst->src[1]; // input
1823
-
1824
- // The shape of the weight is NCHW.
1825
- // Matrix multiplication uses HW dims.
1826
-     // NC is regarded as the batch.
1827
-     // The weight needs to be transposed.
1828
- float weight_elem_size;
1829
- if (type == GGML_TYPE_Q4_0) {
1830
- weight_elem_size = float(sizeof(uint8_t)) / 2;
1831
- } else if (type == GGML_TYPE_Q8_0) {
1832
- weight_elem_size = float(sizeof(uint8_t));
1833
- } else {
1834
- GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
1835
- }
1836
- float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
1837
- size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
1838
- size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
1839
-
1840
-     // The scale is stored at the end of the weight and also needs to be transposed.
1841
- size_t scale_elem_size = sizeof(uint16_t);
1842
- size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
1843
- scale_elem_size};
1844
- size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
1845
- char* scale_offset = (char*)src0->data + weight_size;
1846
-
1847
- // input
1848
- size_t input_elem_size = sizeof(uint16_t);
1849
- int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
1850
- size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
1851
- size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
1852
- ggml_cann_pool_alloc input_alloctor(ctx.pool());
1853
- void* input_buffer = src1->data;
1854
-
1855
-     // cast input to fp16 if needed
1856
- if (src1->type != GGML_TYPE_F16) {
1857
- aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
1858
- input_buffer =
1859
- input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
1860
-
1861
- int64_t* input_cast_ne = src1->ne;
1862
- size_t input_cast_nb[GGML_MAX_DIMS];
1863
- input_cast_nb[0] = sizeof(uint16_t);
1864
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1865
- input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
1866
- }
1867
-
1868
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(
1869
- input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
1870
- input_cast_nb, GGML_MAX_DIMS);
1871
- aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
1872
- ggml_cann_release_resources(ctx, acl_input_tensor, acl_src1_tensor);
1873
- }
1874
-
1875
- // output
1876
- size_t output_elem_size = sizeof(uint16_t);
1877
- size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
1878
- ggml_cann_pool_alloc output_allocator(ctx.pool());
1879
- void* output_buffer =
1880
- output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
1881
- size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
1882
-
1883
- // aclnn
1884
- int64_t max_elem_size = 65535;
1885
- int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
1886
- ggml_cann_pool_alloc workspace_allocator(ctx.pool());
1887
- for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
1888
- for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
1889
- int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
1890
- int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
1891
-
1892
- int64_t batch1 = (n1 * src1->ne[2]) + c1;
1893
- int64_t batch0 = (n0 * src0->ne[2]) + c0;
1894
-
1895
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(
1896
- (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
1897
- input_elem_size, input_ne, input_nb, 2);
1898
-
1899
- // first split
1900
- int64_t weight_ne_offset = 0;
1901
- int64_t weight_ne[2] = {
1902
- max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
1903
- src0->ne[0]};
1904
- int64_t scale_ne_offset = 0;
1905
- int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
1906
- int64_t output_ne_offset = 0;
1907
- int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
1908
-
1909
- aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
1910
- (char*)src0->data + batch0 * weight_stride,
1911
- ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
1912
- weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
1913
- aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
1914
- scale_offset + batch0 * scale_stride, ACL_FLOAT16,
1915
- scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
1916
- scale_ne_offset);
1917
- aclTensor* acl_output_tensor = ggml_cann_create_tensor(
1918
- (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
1919
- output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
1920
- output_ne_offset);
1921
- int64_t antiquantGroupSize = 0;
1922
- if (src0->ne[0] > QK8_0) {
1923
- antiquantGroupSize = QK8_0;
1924
- }
1925
- GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
1926
- acl_weight_tensor, acl_scale_tensor, nullptr,
1927
- nullptr, nullptr, nullptr, antiquantGroupSize,
1928
- acl_output_tensor);
1929
- ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
1930
-
1931
- // other splits
1932
- for (int64_t split = 1; split < split_size; split++) {
1933
- weight_ne_offset +=
1934
- weight_elem_size * weight_ne[0] * weight_ne[1];
1935
- weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
1936
- ? src0->ne[1] - (max_elem_size * split)
1937
- : max_elem_size;
1938
- scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
1939
- scale_ne[0] = weight_ne[0];
1940
- output_ne_offset +=
1941
- output_elem_size * output_ne[0] * output_ne[1];
1942
- output_ne[0] = weight_ne[0];
1943
-
1944
- acl_weight_tensor = ggml_cann_create_tensor(
1945
- (char*)src0->data + batch0 * weight_stride,
1946
- ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
1947
- weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
1948
- acl_scale_tensor = ggml_cann_create_tensor(
1949
- scale_offset + batch0 * scale_stride, ACL_FLOAT16,
1950
- scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
1951
- scale_ne_offset);
1952
- acl_output_tensor = ggml_cann_create_tensor(
1953
- (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
1954
- output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
1955
- output_ne_offset);
1956
- GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
1957
- acl_weight_tensor, acl_scale_tensor, nullptr,
1958
- nullptr, nullptr, nullptr, antiquantGroupSize,
1959
- acl_output_tensor);
1960
- ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
1961
- }
1962
-
1963
- ggml_cann_release_resources(ctx, acl_input_tensor);
1964
- }
1965
- }
1966
-
1967
- // cast out
1968
- if (dst->type != GGML_TYPE_F16) {
1969
- int64_t* output_cast_ne = dst->ne;
1970
- size_t output_cast_nb[GGML_MAX_DIMS];
1971
- output_cast_nb[0] = sizeof(uint16_t);
1972
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1973
- output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
1974
- }
1975
-
1976
- aclTensor* acl_output_tensor = ggml_cann_create_tensor(
1977
- output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
1978
- output_cast_nb, GGML_MAX_DIMS);
1979
- aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
1980
- aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
1981
-
1982
- ggml_cann_release_resources(ctx, acl_output_tensor, acl_dst_tensor);
1983
- }
1984
- }
1985
-
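The quantized matmul loop caps each WeightQuantBatchMatmulV2 call at max_elem_size = 65535 elements along src0->ne[1] and walks the dimension in chunks, adjusting the view offsets and extents per chunk. The chunking arithmetic on its own (host-side sketch; the helper name is hypothetical):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Split a dimension of length `n` into chunks of at most `max_elem` elements,
// mirroring the first-split / remaining-splits loop used above.
static std::vector<int64_t> split_dim_ref(int64_t n, int64_t max_elem = 65535) {
    std::vector<int64_t> chunks;
    for (int64_t offset = 0; offset < n; offset += max_elem) {
        chunks.push_back(std::min(max_elem, n - offset));
    }
    return chunks;
}
```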
1986
- void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1987
- const enum ggml_type type = dst->src[0]->type;
1988
- switch (type) {
1989
- case GGML_TYPE_F32:
1990
- case GGML_TYPE_F16:
1991
- ggml_cann_mat_mul_fp(ctx, dst);
1992
- break;
1993
- case GGML_TYPE_Q4_0:
1994
- case GGML_TYPE_Q8_0:
1995
- ggml_cann_mul_mat_quant(ctx, dst, type);
1996
- break;
1997
- default:
1998
- GGML_ABORT("Unsupported type for mul_mat");
1999
- break;
2000
- }
2001
- }
2002
-
2003
- /**
2004
- * @brief Rolls the elements of a tensor along a specified dimension.
2005
- *
2006
- * This function rolls the elements of the source tensor `acl_src` by the
2007
- * specified shifts `shifts` along the specified dimensions `dims`, and stores
2008
- * the result in the destination tensor `acl_dst`.
2009
- *
2010
- * @param ctx The context for the CANN backend operations.
2011
- * @param acl_src The source tensor whose elements will be rolled.
2012
- * @param acl_dst The destination tensor where the rolled elements will be
2013
- * stored.
2014
- * @param shifts An array specifying the number of positions by which elements
2015
- * are shifted.
2016
- * @param dims An array specifying the dimensions along which elements are
2017
- * shifted.
2018
- */
2019
- static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2020
- aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
2021
- aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
2022
- aclIntArray* acl_dims = aclCreateIntArray(dims, 1);
2023
- GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts, acl_dims, acl_dst);
2024
- ggml_cann_release_resources(ctx, acl_shifts, acl_dims);
2025
- }
2026
-
2027
- /**
2028
- * @brief Fills specified positions of a tensor with a scalar value.
2029
- *
2030
- * This function fills the positions in the source tensor `acl_src` specified by
2031
- * `index` along the dimension `dim` with the scalar value `value`.
2032
- *
2033
- * @param ctx The context for the CANN backend operations.
2034
- * @param acl_src The source tensor where the positions will be filled.
2035
- * @param dim The dimension along which the positions are specified.
2036
- * @param index An array specifying the positions to be filled.
2037
- * @param index_num The number of positions specified in the index array.
2038
- * @param value The scalar value used to fill the specified positions.
2039
- */
2040
- static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
2041
- aclTensor* acl_src, int64_t dim,
2042
- int64_t* index, int64_t index_num,
2043
- float value) {
2044
- aclIntArray* acl_index = aclCreateIntArray(index, index_num);
2045
- aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
2046
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index, acl_value);
2047
- ggml_cann_release_resources(ctx, acl_index, acl_value);
2048
- }
2049
-
2050
- static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
2051
- aclTensor* acl_cos_repeat_tensor,
2052
- aclTensor* acl_sin_repeat_tensor,
2053
- float theta_scale, float freq_scale,
2054
- float attn_factor, bool is_neox) {
2055
- // init sin/cos cache; the cache uses a different repeat method depending on
2056
- // @param.is_neox
2057
-
2058
- ggml_tensor* src0 = dst->src[0]; // input
2059
- ggml_tensor* src1 = dst->src[1]; // position
2060
- ggml_tensor* src2 = dst->src[2]; // freq_factors
2061
-
2062
- GGML_TENSOR_BINARY_OP_LOCALS
2063
-
2064
- // theta_scale arange, [0,1,...,ne00/2 - 1]
2065
- int64_t theta_scale_length = ne00 / 2;
2066
- ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
2067
- theta_scale_length * sizeof(float_t));
2068
- void* theta_scale_buffer = theta_scale_allocator.get();
2069
- int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
2070
- size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
2071
- theta_scale_length * sizeof(float_t)};
2072
-
2073
- aclTensor* acl_theta_scale_tensor =
2074
- ggml_cann_create_tensor(theta_scale_buffer, ACL_FLOAT, sizeof(float_t),
2075
- theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2076
- float start = 0;
2077
- float step = 1;
2078
- float stop = ne00 / 2;
2079
- float n_elements = ne00 / 2;
2080
- aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements);
2081
-
2082
- // power
2083
- aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
2084
- GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
2085
- acl_theta_scale_tensor);
2086
-
2087
- // freq_scale
2088
- if (freq_scale != 1) {
2089
- aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
2090
- }
2091
-
2092
- // freq_factors
2093
- if (src2) {
2094
- aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
2095
- src2->data, ggml_cann_type_mapping(src2->type),
2096
- ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2097
- aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor);
2098
- ggml_cann_release_resources(ctx, acl_freq_factors_tensor);
2099
- }
2100
-
2101
- // position
2102
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
2103
- int64_t position_length = src1->ne[0];
2104
- int64_t position_ne[] = {1, 1, position_length, 1};
2105
- size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), sizeof(int32_t),
2106
- sizeof(int32_t) * position_length};
2107
- aclTensor* acl_position_tensor = ggml_cann_create_tensor(
2108
- src1->data, ggml_cann_type_mapping(src1->type),
2109
- ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);
2110
-
2111
- // power * position
2112
- int64_t theta_length = theta_scale_length * position_length;
2113
- ggml_cann_pool_alloc theta_allocator(ctx.pool(),
2114
- theta_length * sizeof(float_t));
2115
- void* theta_buffer = theta_allocator.get();
2116
- int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1};
2117
- size_t theta_nb[GGML_MAX_DIMS];
2118
- theta_nb[0] = sizeof(float_t);
2119
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2120
- theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
2121
- }
2122
- aclTensor* acl_theta_tensor =
2123
- ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
2124
- theta_ne, theta_nb, GGML_MAX_DIMS);
2125
- aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
2126
- acl_theta_tensor);
2127
-
2128
- // sin/cos
2129
- ggml_cann_pool_alloc sin_allocator(ctx.pool(),
2130
- theta_length * sizeof(float_t));
2131
- void* sin_buffer = sin_allocator.get();
2132
- aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
2133
- sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
2134
- GGML_MAX_DIMS, ACL_FORMAT_ND);
2135
- aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);
2136
-
2137
- ggml_cann_pool_alloc cos_allocator(ctx.pool(),
2138
- theta_length * sizeof(float_t));
2139
- void* cos_buffer = cos_allocator.get();
2140
- aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
2141
- cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
2142
- GGML_MAX_DIMS, ACL_FORMAT_ND);
2143
- aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
2144
-
2145
- // attn_factor
2146
- if (attn_factor != 1) {
2147
- aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
2148
- aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
2149
- }
2150
-
2151
- // repeat
2152
- if (is_neox) {
2153
- int64_t repeatsArray[] = {1, 1, 1, 2};
2154
- aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray);
2155
- aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray);
2156
- } else {
2157
- int64_t num_repeats = 2;
2158
- int64_t dim = 3;
2159
- int64_t output_size = theta_scale_length * num_repeats;
2160
- aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
2161
- num_repeats, output_size);
2162
- aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
2163
- num_repeats, output_size);
2164
- }
2165
-
2166
- // release
2167
- ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
2168
- acl_theta_tensor, acl_sin_tensor, acl_cos_tensor, acl_theta_scale);
2169
- }
2170
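For reference, the cache this deleted helper builds is the standard RoPE angle table. Below is a minimal host-side sketch (a hypothetical `rope_cache_sketch` helper with plain float buffers; it ignores the optional freq_factors division and the FP16 handling in the real kernel):

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical sketch of the sin/cos cache built above: for each position p
// and frequency index i, theta = p * freq_scale * theta_scale^i; sin/cos are
// scaled by attn_factor and laid out blockwise (neox) or interleaved (normal).
static void rope_cache_sketch(const std::vector<int32_t>& pos, int n_dims,
                              float theta_scale, float freq_scale,
                              float attn_factor, bool is_neox,
                              std::vector<float>& sin_out,
                              std::vector<float>& cos_out) {
    const int half = n_dims / 2;
    sin_out.assign(pos.size() * n_dims, 0.0f);
    cos_out.assign(pos.size() * n_dims, 0.0f);
    for (std::size_t p = 0; p < pos.size(); ++p) {
        for (int i = 0; i < half; ++i) {
            const float theta = pos[p] * freq_scale * std::pow(theta_scale, (float) i);
            const float s = std::sin(theta) * attn_factor;
            const float c = std::cos(theta) * attn_factor;
            const int j0 = is_neox ? i        : 2 * i;      // first copy
            const int j1 = is_neox ? i + half : 2 * i + 1;  // repeated copy
            sin_out[p * n_dims + j0] = s; sin_out[p * n_dims + j1] = s;
            cos_out[p * n_dims + j0] = c; cos_out[p * n_dims + j1] = c;
        }
    }
}
```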
-
2171
- #ifdef __cplusplus
2172
- extern "C" {
2173
- #endif
2174
- aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
2175
- const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
2176
- int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
2177
- aclOpExecutor** executor);
2178
- aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
2179
- uint64_t workspaceSize,
2180
- aclOpExecutor* executor,
2181
- aclrtStream stream);
2182
- #ifdef __cplusplus
2183
- }
2184
- #endif
2185
-
2186
- void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2187
- // TODO: use ascendc
2188
- // Only tested with the LLAMA model.
2189
- ggml_tensor* src0 = dst->src[0]; // input
2190
-
2191
- // param
2192
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
2193
- // const int n_past = ((int32_t *) dst->op_params)[0];
2194
- const int n_dims = ((int32_t*)dst->op_params)[1];
2195
- const int mode = ((int32_t*)dst->op_params)[2];
2196
- // const int n_ctx = ((int32_t *) dst->op_params)[3];
2197
- const int n_ctx_orig = ((int32_t*)dst->op_params)[4];
2198
-
2199
- GGML_TENSOR_UNARY_OP_LOCALS
2200
-
2201
- memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float));
2202
- memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float));
2203
- memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float));
2204
- memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float));
2205
- memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
2206
- memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
2207
-
2208
- // TODO: n_dims <= ne0
2209
- GGML_ASSERT(n_dims == ne0);
2210
- GGML_ASSERT(n_dims % 2 == 0);
2211
- // TODO: ext_factor != 0
2212
- GGML_ASSERT(ext_factor == 0);
2213
-
2214
- const float theta_scale = powf(freq_base, -2.0f / n_dims);
2215
-
2216
- float corr_dims[2];
2217
- ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
2218
- beta_slow, corr_dims);
2219
-
2220
- const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
2221
-
2222
- // init cos/sin cache
2223
- ggml_cann_pool_alloc sin_allocator(
2224
- ctx.pool(), ne00 * ne02 * sizeof(float_t));
2225
- ggml_cann_pool_alloc cos_allocator(
2226
- ctx.pool(), ne00 * ne02 * sizeof(float_t));
2227
- void* sin_buffer = sin_allocator.get();
2228
- void* cos_buffer = cos_allocator.get();
2229
-
2230
- int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
2231
- size_t sin_reshape_nb[GGML_MAX_DIMS];
2232
- sin_reshape_nb[0] = sizeof(float_t);
2233
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2234
- sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
2235
- }
2236
- aclTensor* acl_sin_reshape_tensor =
2237
- ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t),
2238
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2239
- aclTensor* acl_cos_reshape_tensor =
2240
- ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
2241
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2242
- aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
2243
- theta_scale, freq_scale, attn_factor, is_neox);
2244
-
2245
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
2246
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2247
-
2248
- #ifdef ASCEND_310P
2249
- // Special ROPE operation for 310P
2250
-
2251
- // roll input
2252
- void* input_roll_buffer;
2253
- aclTensor* acl_minus_one_tensor;
2254
- void* minus_one_scale_buffer = nullptr;
2255
- ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
2256
- ggml_cann_pool_alloc minus_one_scale_allocator(
2257
- ctx.pool(), sizeof(float_t) * src0->ne[0]);
2258
- if (!is_neox) {
2259
- // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
2260
- input_roll_buffer = roll_allocator.get();
2261
- int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
2262
- src0->ne[2], src0->ne[3]};
2263
- size_t input_roll_nb[GGML_MAX_DIMS];
2264
- input_roll_nb[0] = ggml_type_size(src0->type);
2265
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2266
- input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
2267
- }
2268
- aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
2269
- input_roll_buffer, ggml_cann_type_mapping(src0->type),
2270
- ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
2271
- GGML_MAX_DIMS);
2272
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(
2273
- src0->data, ggml_cann_type_mapping(src0->type),
2274
- ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
2275
- GGML_MAX_DIMS);
2276
-
2277
- int64_t shifts[] = {1};
2278
- int64_t dims[] = {3};
2279
- aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2280
- ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
2281
-
2282
- // init [-1, 1, -1, 1, ...]
2283
- minus_one_scale_buffer = minus_one_scale_allocator.get();
2284
-
2285
- int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
2286
- size_t minus_one_nb[GGML_MAX_DIMS];
2287
- minus_one_nb[0] = sizeof(float_t);
2288
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2289
- minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
2290
- }
2291
- acl_minus_one_tensor = aclnn_values(
2292
- ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
2293
- minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
2294
- int64_t dim = 3;
2295
- int64_t* index = new int64_t[src0->ne[0]];
2296
- for (int i = 0; i < src0->ne[0]; i++) {
2297
- index[i] = i / 2 * 2;
2298
- }
2299
- int64_t index_num = src0->ne[0];
2300
- float value = -1;
2301
- aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
2302
- index_num, value);
2303
- } else {
2304
- // roll input: [q0,q1,q2,...] ->
2305
- // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
2306
- input_roll_buffer = roll_allocator.get();
2307
- aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
2308
- input_roll_buffer, ggml_cann_type_mapping(src0->type),
2309
- ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
2310
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
2311
-
2312
- int64_t shifts[] = {src0->ne[0] / 2};
2313
- int64_t dims[] = {3};
2314
- aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2315
-
2316
- ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
2317
- // init [-1, -1, ..., -1, 1, 1, ..., 1]
2318
- minus_one_scale_buffer = minus_one_scale_allocator.get();
2319
- int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
2320
- size_t minus_one_nb[GGML_MAX_DIMS];
2321
- minus_one_nb[0] = sizeof(float_t);
2322
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2323
- minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
2324
- }
2325
- acl_minus_one_tensor = aclnn_values(
2326
- ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
2327
- minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
2328
- // -1 * first half
2329
- int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
2330
- size_t first_half_nb[GGML_MAX_DIMS];
2331
- first_half_nb[0] = sizeof(float_t);
2332
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2333
- first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
2334
- }
2335
- aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
2336
- minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
2337
- first_half_nb, GGML_MAX_DIMS);
2338
- bool inplace = true;
2339
- float scale = -1;
2340
- aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
2341
- ggml_cann_release_resources(ctx, acl_first_half_tensor);
2342
- }
2343
-
2344
- // TODO: n_dims < ne0
2345
- GGML_ASSERT(n_dims == src0->ne[0]);
2346
-
2347
- // input * scale
2348
- ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
2349
- ggml_nbytes(src0));
2350
- void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
2351
- size_t input_nb[GGML_MAX_DIMS];
2352
- input_nb[0] = ggml_type_size(src0->type);
2353
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2354
- input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
2355
- }
2356
- aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
2357
- input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
2358
- ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
2359
- aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
2360
- input_roll_buffer, ggml_cann_type_mapping(src0->type),
2361
- ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
2362
-
2363
- aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
2364
- acl_input_roll_mul_scale_tensor);
2365
-
2366
- // output
2367
- void* output_fp32_buffer;
2368
- if (src0->type == GGML_TYPE_F32) {
2369
- aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor);
2370
- aclnn_mul(ctx, acl_input_roll_mul_scale_tensor,
2371
- acl_sin_reshape_tensor);
2372
- aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
2373
- // TODO: ne0 != n_dims in mode2
2374
- } else if (src0->type == GGML_TYPE_F16) {
2375
- size_t input_fp32_nb[GGML_MAX_DIMS];
2376
- input_fp32_nb[0] = sizeof(float_t);
2377
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2378
- input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
2379
- }
2380
- ggml_cann_pool_alloc fp32_allocator1(
2381
- ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
2382
- void* input_fp32_buffer1 = fp32_allocator1.get();
2383
- aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
2384
- input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
2385
- input_fp32_nb, GGML_MAX_DIMS);
2386
- ggml_cann_pool_alloc fp32_allocator2(
2387
- ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
2388
- void* input_fp32_buffer2 = fp32_allocator2.get();
2389
- aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
2390
- input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
2391
- input_fp32_nb, GGML_MAX_DIMS);
2392
-
2393
- ggml_cann_pool_alloc fp32_allocator(
2394
- ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
2395
- output_fp32_buffer = fp32_allocator.get();
2396
- aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
2397
- output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
2398
- input_fp32_nb, GGML_MAX_DIMS);
2399
- aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
2400
- aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
2401
- input_fp32_tensor2);
2402
- aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
2403
- output_fp32_tensor);
2404
- aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
2405
-
2406
- ggml_cann_release_resources(ctx, input_fp32_tensor1, input_fp32_tensor2,
2407
- output_fp32_tensor, acl_sin_reshape_tensor,
2408
- acl_minus_one_tensor, acl_input_roll_mul_scale_tensor,
2409
- acl_input_roll_reshape_tensor, acl_src);
2410
- }
2411
- return;
2412
- #endif
2413
-
2414
- // ggml mode = 0 --> acl mode = 1
2415
- int64_t acl_mode = mode == 0 ? 1 : mode;
2416
-
2417
- switch (src0->type) {
2418
- case GGML_TYPE_F32: {
2419
- GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src,
2420
- acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst);
2421
- break;
2422
- }
2423
- case GGML_TYPE_F16: {
2424
- ggml_cann_pool_alloc src_trans_allocator(
2425
- ctx.pool(), ggml_nelements(src0) * sizeof(float));
2426
- void* src_trans_buffer = src_trans_allocator.get();
2427
- ggml_cann_pool_alloc dst_trans_allocator(
2428
- ctx.pool(), ggml_nelements(dst) * sizeof(float));
2429
- void* dst_trans_buffer = dst_trans_allocator.get();
2430
-
2431
- size_t src_trans_nb[GGML_MAX_DIMS];
2432
- src_trans_nb[0] = sizeof(float);
2433
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2434
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
2435
- }
2436
-
2437
- aclTensor* acl_src_trans_tensor = ggml_cann_create_tensor(
2438
- src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb,
2439
- GGML_MAX_DIMS);
2440
- aclTensor* acl_dst_trans_tensor = ggml_cann_create_tensor(
2441
- dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb,
2442
- GGML_MAX_DIMS);
2443
-
2444
- aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT);
2445
-
2446
- GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor,
2447
- acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
2448
- acl_dst_trans_tensor);
2449
-
2450
- aclnn_cast(ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16);
2451
-
2452
- ggml_cann_release_resources(ctx, acl_src_trans_tensor,
2453
- acl_dst_trans_tensor);
2454
- break;
2455
- }
2456
- default:
2457
- GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
2458
- break;
2459
- }
2460
- ggml_cann_release_resources(ctx, acl_cos_reshape_tensor,
2461
- acl_sin_reshape_tensor, acl_src, acl_dst);
2462
- }
2463
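The fallback path above is the familiar rotate-half formulation: dst = src * cos + rotate(src) * sign * sin, where the rotation and sign pattern depend on the NeoX flag. A minimal single-row CPU sketch (hypothetical names; the deleted code performs the same steps with aclnn roll/mul/add kernels on the NPU):

```cpp
// Hypothetical per-row sketch of the 310P fallback above.
static void rope_apply_row_sketch(const float* src, const float* cos_cache,
                                  const float* sin_cache, float* dst,
                                  int n_dims, bool is_neox) {
    const int half = n_dims / 2;
    for (int i = 0; i < n_dims; ++i) {
        float rotated, sign;
        if (is_neox) {
            // halves swap; the first half picks up the sign flip
            rotated = (i < half) ? src[i + half] : src[i - half];
            sign    = (i < half) ? -1.0f : 1.0f;
        } else {
            // neighbours swap within each (even, odd) pair; even gets the flip
            rotated = (i % 2 == 0) ? src[i + 1] : src[i - 1];
            sign    = (i % 2 == 0) ? -1.0f : 1.0f;
        }
        dst[i] = src[i] * cos_cache[i] + rotated * sign * sin_cache[i];
    }
}
```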
-
2464
-
2465
- void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2466
- ggml_tensor * src0 = dst->src[0];
2467
-
2468
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
2469
- aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
2470
-
2471
- GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src, 3, false, acl_dst);
2472
-
2473
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
2474
- }
2475
-
2476
- void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2477
- ggml_tensor * src0 = dst->src[0];
2478
- ggml_tensor * src1 = dst->src[1];
2479
-
2480
- // stride
2481
- int64_t s0 = ((const int32_t*)(dst->op_params))[0];
2482
-
2483
- aclTensor* acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
2484
- aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
2485
- aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
2486
-
2487
- int64_t strideVal[1];
2488
- strideVal[0] = s0;
2489
- aclIntArray *stride = aclCreateIntArray(strideVal, 1);
2490
- int64_t paddingVal[] = {0};
2491
- aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
2492
- int64_t dilationVal[] = {1};
2493
- aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
2494
- bool transposed = true;
2495
- int64_t groups = 1;
2496
- int8_t cubeMathType = 0;
2497
-
2498
- #ifdef ASCEND_310P
2499
- cubeMathType = 1;
2500
- #endif
2501
-
2502
- GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride,
2503
- padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
2504
-
2505
- ggml_cann_release_resources(ctx, acl_weight, acl_dst, stride, padding, dilation);
2506
- }
2507
-
2508
- void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2509
- ggml_tensor * src0 = dst->src[0];
2510
-
2511
- aclTensor* acl_input = ggml_cann_create_tensor(src0);
2512
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2513
-
2514
- float alphaValue = 1.0f;
2515
- aclScalar* alpha = nullptr;
2516
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2517
-
2518
- GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input, alpha, alpha, alpha,
2519
- acl_dst);
2520
-
2521
- ggml_cann_release_resources(ctx, acl_input, acl_dst, alpha);
2522
- }
2523
-
2524
- void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2525
- ggml_tensor * src0 = dst->src[0];
2526
-
2527
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
2528
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2529
-
2530
- int64_t reduceDimValue[] = {3};
2531
- aclIntArray* reduceDim = aclCreateIntArray(reduceDimValue, 1);
2532
- bool keepDim = true;
2533
-
2534
- GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src, reduceDim, keepDim, ACL_FLOAT, acl_dst);
2535
-
2536
- ggml_cann_release_resources(ctx, acl_src, acl_dst, reduceDim);
2537
- }
2538
-
2539
- void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2540
- ggml_tensor * src0 = dst->src[0];
2541
- int32_t *opts = (int32_t *) dst->op_params;
2542
- int64_t paddingsArray[2] = {opts[0], opts[1]};
2543
- aclIntArray* paddings = aclCreateIntArray(paddingsArray, 2);
2544
-
2545
- for (int64_t i = 0; i < src0->ne[3]; i++) {
2546
- aclTensor* acl_src = ggml_cann_create_tensor(
2547
- (char*)src0->data + i * src0->ne[3],
2548
- ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
2549
- src0->ne, src0->nb, 3);
2550
-
2551
- aclTensor* acl_dst = ggml_cann_create_tensor(
2552
- (char*)dst->data + i * src0->ne[3],
2553
- ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
2554
- dst->ne, dst->nb, 3);
2555
-
2556
- GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src, paddings, acl_dst);
2557
-
2558
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
2559
- }
2560
- ggml_cann_release_resources(ctx, paddings);
2561
- }
2562
-
2563
- void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2564
- ggml_tensor * src0 = dst->src[0];
2565
- ggml_tensor * src1 = dst->src[1];
2566
-
2567
- aclTensor* acl_self = ggml_cann_create_tensor(src0);
2568
- aclTensor* acl_other = ggml_cann_create_tensor(src1);
2569
-
2570
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self, acl_other);
2571
-
2572
- ggml_cann_sum(ctx, dst);
2573
-
2574
- ggml_cann_release_resources(ctx, acl_self, acl_other);
2575
- }
2576
-
2577
- void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2578
- ggml_tensor * src0 = dst->src[0];
2579
-
2580
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
2581
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2582
-
2583
- float alphaValue = 0.0f;
2584
- aclScalar* alpha = nullptr;
2585
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2586
-
2587
- GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src, alpha, acl_dst);
2588
-
2589
- ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
2590
- }
2591
-
2592
- /**
2593
- * @brief Performs expert-specific matrix multiplication (MoE) with
2594
- * floating-point precision using the CANN backend.
2595
- *
2596
- * This function executes a matrix multiplication operation tailored for
2597
- * Mixture of Experts (MoE) models, where the input tensor is multiplied
2598
- * with expert-specific weight matrices. It uses the CANN backend for
2599
- * efficient computation and stores the result in the destination tensor `dst`.
2600
- * The operation may leverage identity-based optimizations or routing masks
2601
- * as part of sparse expert selection.
2602
- *
2603
- * @param ctx The context for executing CANN backend operations.
2604
- * @param dst The destination tensor where the MoE multiplication result
2605
- * will be stored.
2606
- *
2607
- * @note This function assumes floating-point data types and is designed for
2608
- * MoE architectures, possibly involving sparse expert routing.
2609
- */
2610
- static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2611
- //dst [M, K, N, 1]
2612
- ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
2613
- ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
2614
- ggml_tensor * ids = dst->src[2]; //ids [K, N]
2615
-
2616
- GGML_TENSOR_BINARY_OP_LOCALS
2617
-
2618
- // copy index from npu to cpu
2619
- int64_t n_as = ne02; // A
2620
- int64_t n_ids = ids->ne[0]; // K
2621
-
2622
- std::vector<char> ids_host(ggml_nbytes(ids));
2623
- ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
2624
- ACL_MEMCPY_DEVICE_TO_HOST);
2625
- ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
2626
-
2627
- char * src0_original = (char *) src0->data;
2628
- char * src1_original = (char *) src1->data;
2629
- char * dst_original = (char *) dst->data;
2630
- size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
2631
-
2632
- // src0 is F16, src1 is F32, dst is F32
2633
- ggml_cann_pool_alloc src0_cast_allocator;
2634
- if (src0->type == GGML_TYPE_F16) {
2635
- src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
2636
- void* src0_cast_buf = src0_cast_allocator.get();
2637
-
2638
- size_t cast_nb[GGML_MAX_DIMS];
2639
- cast_nb[0] = sizeof(float_t);
2640
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2641
- cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
2642
- }
2643
-
2644
- aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
2645
- aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
2646
- ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
2647
- GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
2648
- ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
2649
-
2650
- src0_original = (char *) src0_cast_buf;
2651
- memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
2652
- }
2653
-
2654
- std::vector<aclTensor*> src0_tensor_vec;
2655
- std::vector<aclTensor*> src1_tensor_vec;
2656
- std::vector<aclTensor*> dst_tensor_vec;
2657
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2658
- for (int64_t id = 0; id < n_ids; id++) {
2659
- // src0_row [M, D] -> weight && permute
2660
- int64_t src0_ne[2] = {ne01, ne00};
2661
- size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
2662
- // src1_row [D, 1] -> input
2663
- int64_t src1_ne[2] = {ne10, 1};
2664
- size_t src1_nb[2] = {nb10, nb11};
2665
- // dst_row [M, 1] -> out
2666
- int64_t dst_ne[2] = {ne0, 1};
2667
- size_t dst_nb[2] = {nb0, nb1};
2668
-
2669
- // expert index
2670
- int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2671
- GGML_ASSERT(i02 >= 0 && i02 < n_as);
2672
-
2673
- // If B = 1 (broadcast), always use 0; otherwise, use id.
2674
- int64_t i11 = (ne11 == 1 ? 0 : id);
2675
- int64_t i12 = iid1;
2676
-
2677
- int64_t i1 = id;
2678
- int64_t i2 = i12;
2679
-
2680
- void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
2681
- void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
2682
- void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
2683
-
2684
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
2685
- ACL_FLOAT, sizeof(float),
2686
- src0_ne, src0_nb, 2);
2687
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
2688
- ACL_FLOAT, sizeof(float),
2689
- src1_ne, src1_nb, 2);
2690
- aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
2691
- ACL_FLOAT, sizeof(float),
2692
- dst_ne, dst_nb, 2);
2693
-
2694
- src0_tensor_vec.push_back(acl_src0);
2695
- src1_tensor_vec.push_back(acl_src1);
2696
- dst_tensor_vec.push_back(acl_dst);
2697
- }
2698
- }
2699
-
2700
- size_t GROUP_SIZE = 128;
2701
- // GroupedMatmulV2 requires tensor_list.size < 128
2702
- for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
2703
- // split and call GroupedMatmulV2
2704
- size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
2705
- std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
2706
- std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
2707
- std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
2708
-
2709
- aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
2710
- aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
2711
- aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
2712
-
2713
- GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
2714
- nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
2715
-
2716
- ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
2717
- }
2718
- return;
2719
- }
2720
-
2721
- /**
2722
- * @brief Performs expert-specific matrix multiplication (MoE) with
2723
- * quantized precision using the CANN backend.
2724
- *
2725
- * This function executes a matrix multiplication operation tailored for
2726
- * Mixture of Experts (MoE) models, where the input tensor is multiplied
2727
- * with expert-specific quantized weight matrices. It leverages the CANN
2728
- * backend to perform efficient low-precision computations and stores the
2729
- * quantized result in the destination tensor `dst`.
2730
- *
2731
- * Quantization techniques reduce memory footprint and improve performance
2732
- * by using lower-bit representations (e.g., int8) instead of floating-point.
2733
- * This function is designed to work with such formats and may incorporate
2734
- * optimizations like identity-based fast paths or routing masks for sparse
2735
- * expert selection.
2736
- *
2737
- * @param ctx The context for executing CANN backend operations.
2738
- * @param dst The destination tensor where the quantized MoE multiplication result
2739
- * will be stored.
2740
- *
2741
- * @note This function assumes quantized data types and is designed for
2742
- * MoE architectures with potential sparse expert routing.
2743
- */
2744
- static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2745
- // TODO: Use aclnnGroupedMatMul
2746
- //dst [M, K, N, 1]
2747
- ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
2748
- ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
2749
- ggml_tensor * ids = dst->src[2]; //ids [K, N]
2750
-
2751
- GGML_TENSOR_BINARY_OP_LOCALS
2752
-
2753
- // copy index from npu to cpu
2754
- int64_t n_as = ne02; // A
2755
- int64_t n_ids = ids->ne[0]; // K
2756
-
2757
- std::vector<char> ids_host(ggml_nbytes(ids));
2758
- ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
2759
- ACL_MEMCPY_DEVICE_TO_HOST);
2760
- ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
2761
-
2762
- char * src0_original = (char *) src0->data;
2763
- char * src1_original = (char *) src1->data;
2764
- char * dst_original = (char *) dst->data;
2765
-
2766
- ggml_tensor src0_row = *src0;
2767
- ggml_tensor src1_row = *src1;
2768
- ggml_tensor dst_row = *dst;
2769
-
2770
- const enum ggml_type type = dst->src[0]->type;
2771
- float weight_elem_size;
2772
- if (type == GGML_TYPE_Q4_0) {
2773
- weight_elem_size = float(sizeof(uint8_t)) / 2;
2774
- } else if (type == GGML_TYPE_Q8_0) {
2775
- weight_elem_size = float(sizeof(uint8_t));
2776
- } else {
2777
- GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
2778
- }
2779
-
2780
- // src0_row [D, M, 1, 1] weight without permute
2781
- src0_row.ne[2] = 1;
2782
- src0_row.ne[3] = 1;
2783
- src0_row.nb[0] = weight_elem_size;
2784
- src0_row.nb[1] = weight_elem_size * ne00;
2785
- src0_row.nb[2] = weight_elem_size * ne00;
2786
- src0_row.nb[3] = weight_elem_size * ne00;
2787
- size_t weight_stride = ne00 * ne01 * weight_elem_size;
2788
- size_t weight_size = weight_stride * ne02 * ne03;
2789
-
2790
- // scale [D, M, 1, 1] -> scale && permute
2791
- size_t scale_elem_size = sizeof(uint16_t);
2792
- size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
2793
-
2794
- // src1_row [D, 1, 1, 1] -> input
2795
- src1_row.ne[1] = 1;
2796
- src1_row.ne[2] = 1;
2797
- src1_row.ne[3] = 1;
2798
- src1_row.nb[2] = nb11;
2799
- src1_row.nb[3] = nb11;
2800
-
2801
- // dst_row [M, 1, 1, 1] -> out
2802
- dst_row.ne[1] = 1;
2803
- dst_row.ne[2] = 1;
2804
- dst_row.ne[3] = 1;
2805
- dst_row.nb[2] = nb1;
2806
- dst_row.nb[3] = nb1;
2807
-
2808
- //create weight for one row
2809
- ggml_cann_pool_alloc weight_allocator(ctx.pool());
2810
- void* weight_buffer = weight_allocator.alloc(nb02);
2811
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2812
- for (int64_t id = 0; id < n_ids; id++) {
2813
- // expert index
2814
- int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2815
- GGML_ASSERT(i02 >= 0 && i02 < n_as);
2816
-
2817
- // If B = 1 (broadcast), always use 0; otherwise, use id.
2818
- int64_t i11 = (ne11 == 1 ? 0 : id);
2819
- int64_t i12 = iid1;
2820
-
2821
- int64_t i1 = id;
2822
- int64_t i2 = i12;
2823
-
2824
- void* src0_tmp_ptr = src0_original + i02*weight_stride;
2825
- void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
2826
- void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
2827
- void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
2828
-
2829
- // mem cpy
2830
- ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
2831
- ACL_MEMCPY_DEVICE_TO_DEVICE);
2832
- void* scale_buffer = (char*)weight_buffer + weight_stride;
2833
- ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
2834
- ACL_MEMCPY_DEVICE_TO_DEVICE);
2835
-
2836
- src0_row.data = weight_buffer;
2837
- src1_row.data = src1_tmp_ptr;
2838
- dst_row.data = dst_tmp_ptr;
2839
- dst_row.src[0] = &src0_row;
2840
- dst_row.src[1] = &src1_row;
2841
-
2842
- ggml_cann_mul_mat(ctx, &dst_row);
2843
- }
2844
- }
2845
- return;
2846
- }
2847
-
2848
- void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2849
- const enum ggml_type type = dst->src[0]->type;
2850
- switch (type) {
2851
- case GGML_TYPE_F32:
2852
- case GGML_TYPE_F16:
2853
- ggml_cann_mul_mat_id_fp(ctx, dst);
2854
- break;
2855
- case GGML_TYPE_Q4_0:
2856
- case GGML_TYPE_Q8_0:
2857
- ggml_cann_mul_mat_id_quant(ctx, dst);
2858
- break;
2859
- default:
2860
- GGML_ABORT("Unsupported type for mul_mat_id");
2861
- break;
2862
- }
2863
- }
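Both MoE paths above share the same routing pattern: for every (token, slot) pair, look up the expert index in `ids` and multiply that token's input row by the selected expert's weight slice. A minimal dense-float sketch of that routing (a hypothetical `mul_mat_id_sketch`; the deleted code batches these rows through GroupedMatmulV2 for F32/F16 and reuses ggml_cann_mul_mat per row for Q4_0/Q8_0):

```cpp
#include <cstdint>

// Hypothetical CPU sketch of the expert routing used by mul_mat_id:
//   src0: [n_expert][M][D] expert weights
//   src1: [N][B][D] inputs, B = K or 1 (broadcast)
//   ids : [N][K] expert indices per token
//   dst : [N][K][M] outputs
static void mul_mat_id_sketch(const float* src0, const float* src1,
                              const int32_t* ids, float* dst,
                              int64_t D, int64_t M, int64_t K,
                              int64_t N, int64_t B) {
    for (int64_t t = 0; t < N; ++t) {          // token
        for (int64_t k = 0; k < K; ++k) {      // selected expert slot
            const int32_t e = ids[t * K + k];  // expert index for this slot
            const float* w = src0 + (int64_t) e * M * D;
            const float* x = src1 + (t * B + (B == 1 ? 0 : k)) * D;
            float* y = dst + (t * K + k) * M;
            for (int64_t m = 0; m < M; ++m) {  // one row of the expert matmul
                float acc = 0.0f;
                for (int64_t d = 0; d < D; ++d) acc += w[m * D + d] * x[d];
                y[m] = acc;
            }
        }
    }
}
```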