@fugood/llama.node 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp (new file)
@@ -0,0 +1,2944 @@
+ /*
+  * Copyright (c) 2023-2024 The ggml authors
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining a copy
+  * of this software and associated documentation files (the "Software"), to
+  * deal in the Software without restriction, including without limitation the
+  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+  * sell copies of the Software, and to permit persons to whom the Software is
+  * furnished to do so, subject to the following conditions:
+  *
+  * The above copyright notice and this permission notice shall be included in
+  * all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+  * IN THE SOFTWARE.
+  */
+
+ #include "aclnn_ops.h"
+
+ #include <aclnnop/aclnn_avgpool2d.h>
+ #include <aclnnop/aclnn_cast.h>
+ #include <aclnnop/aclnn_constant_pad_nd.h>
+ #include <aclnnop/aclnn_copy.h>
+ #include <aclnnop/aclnn_cos.h>
+ #include <aclnnop/aclnn_exp.h>
+ #include <aclnnop/aclnn_fill_scalar.h>
+ #include <aclnnop/aclnn_group_norm.h>
+ #include <aclnnop/aclnn_index_fill_tensor.h>
+ #include <aclnnop/aclnn_layer_norm.h>
+ #include <aclnnop/aclnn_matmul.h>
+ #include <aclnnop/aclnn_max_pool.h>
+ #include <aclnnop/aclnn_permute.h>
+ #include <aclnnop/aclnn_pow_tensor_tensor.h>
+ #include <aclnnop/aclnn_reduce_sum.h>
+ #include <aclnnop/aclnn_repeat.h>
+ #include <aclnnop/aclnn_repeat_interleave.h>
+ #include <aclnnop/aclnn_roll.h>
+ #include <aclnnop/aclnn_sin.h>
+ #include <aclnnop/aclnn_softmax.h>
+ #include <aclnnop/aclnn_tril.h>
+ #include <aclnnop/aclnn_triu.h>
+ #include <aclnnop/aclnn_upsample_nearest_2d.h>
+ #include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
+ #include <float.h>
+
+ #include <cmath>
+ #include <cstring>
+ #include <exception>
+ #include <vector>
+
+ #include "kernels/ascendc_kernels.h"
+
+ #define GGML_COMMON_DECL_C
+
+ #include "../ggml-common.h"
+
+ /**
+  * @brief Repeats elements of a tensor along each dimension according to the
+  * specified repeat array.
+  *
+  * @param ctx The context for the CANN backend operations.
+  * @param acl_src The source tensor to be repeated.
+  * @param acl_dst The destination tensor after repeating.
+  * @param repeat_array The array specifying the number of repetitions along each
+  * dimension.
+  */
+ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                          aclTensor* acl_dst, int64_t* repeat_array) {
+     // repeat tensor along each dim with repeat_array
+     aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst,
+                                           &workspaceSize, &executor));
+
+     if (workspaceSize > 0) {
+         // Memory from allocator will "free" immediately, and this memory
+         // will be alloced to other pointers, but it won't access before
+         // this async task end because all tasks in same stream will execute
+         // in queue.
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+     ACL_CHECK(
+         aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream()));
+     ACL_CHECK(aclDestroyIntArray(repeats));
+ }
+
+ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+     GGML_ASSERT(ggml_can_repeat(src, dst));
+
+     aclTensor* acl_src = ggml_cann_create_tensor(src);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
+                               dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
+
+     aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ /**
+  * @brief Adds two tensors element-wise and stores the result in a destination
+  * tensor.
+  *
+  * This function performs the operation:
+  * \f[
+  * dst = acl\_src0 + alpha \times acl\_src1
+  * \f]
+  * where alpha is a scalar value and defaults to 1.0f.
+  *
+  * @param ctx The context for the CANN backend operations.
+  * @param acl_src0 The first source tensor.
+  * @param acl_src1 The second source tensor.
+  * @param acl_dst The destination tensor where the result will be stored.
+  */
+ static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
+                       aclTensor* acl_src1, aclTensor* acl_dst) {
+     aclScalar* alpha = nullptr;
+     float alphaValue = 1.0f;
+     alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
+                                        &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyScalar(alpha));
+ }
+
+ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src0 = dst->src[0];
+     ggml_tensor* src1 = dst->src[1];
+     GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+     aclTensor* acl_src0;
+     aclTensor* acl_src1;
+     aclTensor* acl_dst;
+
+     // Need bcast
+     if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
+         BCAST_SHAPE(src0, src1)
+         acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
+         acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
+         acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
+     } else {
+         acl_src0 = ggml_cann_create_tensor(src0);
+         acl_src1 = ggml_cann_create_tensor(src1);
+         acl_dst = ggml_cann_create_tensor(dst);
+     }
+
+     aclnn_add(ctx, acl_src0, acl_src1, acl_dst);
+
+     ACL_CHECK(aclDestroyTensor(acl_src0));
+     ACL_CHECK(aclDestroyTensor(acl_src1));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+
+     GGML_ASSERT(src->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     aclTensor* acl_src = ggml_cann_create_tensor(src);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     float negative_slope;
+     memcpy(&negative_slope, dst->op_params, sizeof(float));
+     aclScalar* acl_negative_slope =
+         aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnLeakyReluGetWorkspaceSize(
+         acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(
+         aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyScalar(acl_negative_slope));
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ /**
+  * @brief Concatenates a list of tensors along a specified dimension and stores
+  * the result in a destination tensor.
+  *
+  * @param ctx The context for the CANN backend operations.
+  * @param tensorList The list of tensors to be concatenated.
+  * @param acl_dst The destination tensor where the concatenated result will be
+  * stored.
+  * @param concat_dim The dimension along which the tensors will be concatenated.
+  */
+ static void aclnn_concat(ggml_backend_cann_context& ctx,
+                          aclTensorList* tensorList, aclTensor* acl_dst,
+                          int64_t concat_dim) {
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst,
+                                        &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, ctx.stream()));
+ }
+
+ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src0 = dst->src[0];
+     ggml_tensor* src1 = dst->src[1];
+     aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     int64_t concat_dim = 1;
+     aclTensor* tensors[] = {acl_src0, acl_src1};
+     aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
+     aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+
+     ACL_CHECK(aclDestroyTensorList(tensorList));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ /**
+  * @brief Creates a tensor with values starting from `start`, incremented by
+  * `step`, and ending before `stop`.
+  *
+  * This function performs the operation:
+  * \f[
+  * \text {out }_{i+1}=\text {out }_i+\text {step}
+  * \f]
+  * the range is [start, stop).
+  *
+  * @param ctx The context for the CANN backend operations.
+  * @param acl_dst The destination tensor where the values will be stored.
+  * @param start The starting value of the range.
+  * @param stop The ending value of the range (exclusive).
+  * @param step The step size between consecutive values.
+  * @param n_elements The number of elements in the destination tensor.
+  */
+ static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
+                          float start, float stop, float step,
+                          int64_t n_elements) {
+     int64_t steps = (int64_t)std::ceil((stop - start) / step);
+     GGML_ASSERT(n_elements == steps);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
+     aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
+     aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
+
+     ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst,
+                                           &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(
+         aclnnArange(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyScalar(acl_start));
+     ACL_CHECK(aclDestroyScalar(acl_end));
+     ACL_CHECK(aclDestroyScalar(acl_step));
+ }
+
+ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     int64_t n_elements = ggml_nelements(dst);
+     float start;
+     float stop;
+     float step;
+     memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
+     memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
+     memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
+
+     aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     dst->src[1] = dst->src[0];
+     ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
+ }
+
+ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+     GGML_ASSERT(src->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     float min;
+     float max;
+     memcpy(&min, dst->op_params, sizeof(float));
+     memcpy(&max, (float*)dst->op_params + 1, sizeof(float));
+
+     aclTensor* acl_src = ggml_cann_create_tensor(src);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
+     aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst,
+                                          &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyScalar(acl_min));
+     ACL_CHECK(aclDestroyScalar(acl_max));
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+
+     // scale factor
+     float v;
+     memcpy(&v, dst->op_params, sizeof(float));
+
+     aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
+     aclTensor* acl_src = ggml_cann_create_tensor(src);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize,
+                                         &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyScalar(scale));
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+     enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
+
+     aclTensor* acl_src = ggml_cann_create_tensor(src);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+     ggml_cann_pool_alloc temp_buffer_allocator(
+         ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
+     void* buffer = temp_buffer_allocator.get();
+     aclTensor* tmp_tensor =
+         ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
+                                 dst->ne, dst->nb, GGML_MAX_DIMS);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnArgsortGetWorkspaceSize(
+         acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor,
+         &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(
+         aclnnArgsort(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     workspaceSize = 0;
+     ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor,
+                                         ggml_cann_type_mapping(dst->type),
+                                         acl_dst, &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(tmp_tensor));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+
+     aclTensor* acl_src = ggml_cann_create_tensor(src);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     float eps;
+     memcpy(&eps, dst->op_params, sizeof(float));
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     std::vector<int64_t> normData = {dst->ne[0]};
+     aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
+     ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr,
+                                              eps, acl_dst, nullptr, nullptr,
+                                              &workspaceSize, &executor));
+
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(
+         aclnnLayerNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyIntArray(norm));
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+
+     aclTensor* acl_src = ggml_cann_create_tensor(src);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     const float eps = 1e-6f;  // TODO: make this a parameter
+     int n_groups = dst->op_params[0];
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     int64_t N = src->ne[3];
+     int64_t C = src->ne[2];
+     int64_t HxW = src->ne[1] * src->ne[0];
+
+     size_t type_size = ggml_type_size(src->type);
+     int64_t ne[] = {n_groups, N};
+     size_t nb[] = {type_size, type_size * n_groups};
+     size_t n_bytes = N * n_groups;
+
+     ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
+     void* buffer = temp_buffer_allocator.get();
+     aclTensor* acl_mean_out = ggml_cann_create_tensor(
+         buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+     aclTensor* acl_rstd_out = ggml_cann_create_tensor(
+         (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+
+     ACL_CHECK(aclnnGroupNormGetWorkspaceSize(
+         acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst,
+         acl_mean_out, acl_rstd_out, &workspaceSize, &executor));
+
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(
+         aclnnGroupNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+     ACL_CHECK(aclDestroyTensor(acl_mean_out));
+     ACL_CHECK(aclDestroyTensor(acl_rstd_out));
+ }
+
+ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src0 = dst->src[0];
+     ggml_tensor* src1 = dst->src[1];
+
+     size_t nb1 = ((int32_t*)dst->op_params)[0];
+     size_t nb2 = ((int32_t*)dst->op_params)[1];
+     size_t nb3 = ((int32_t*)dst->op_params)[2];
+     size_t offset = ((int32_t*)dst->op_params)[3];
+     bool inplace = (bool)((int32_t*)dst->op_params)[4];
+
+     size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3};
+
+     aclTensor* acl_dst = ggml_cann_create_tensor(
+         dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
+
+     aclScalar* alpha = nullptr;
+     float alphaValue = 1.0f;
+     alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     if (!inplace) {
+         size_t cpy_size = ggml_nbytes(dst);
+         ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size,
+                                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+         aclTensor* acl_src0 = ggml_cann_create_tensor(
+             src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+         ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
+                                            &workspaceSize, &executor));
+         if (workspaceSize > 0) {
+             ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+             workspaceAddr = workspace_allocator.get();
+         }
+         ACL_CHECK(
+             aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
+         ACL_CHECK(aclDestroyTensor(acl_src0));
+     } else {
+         ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha,
+                                                   &workspaceSize, &executor));
+         if (workspaceSize > 0) {
+             ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+             workspaceAddr = workspace_allocator.get();
+         }
+         ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor,
+                                   ctx.stream()));
+     }
+
+     ACL_CHECK(aclDestroyTensor(acl_src1));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+
+     aclTensor* acl_src = ggml_cann_create_tensor(src);
+
+     GGML_ASSERT(dst->ne[0] == 1);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     int64_t reduce_dims_host[] = {3};
+     aclIntArray* reduce_dims = aclCreateIntArray(reduce_dims_host, 1);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnReduceSumGetWorkspaceSize(
+         acl_src, reduce_dims, true, ggml_cann_type_mapping(src->type), acl_dst,
+         &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(
+         aclnnReduceSum(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
+                                   ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+     aclTensor* acl_src =
+         ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+     aclTensor* acl_dst =
+         ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+     std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
+     auto output_size_array = aclCreateIntArray(output_size.data(), 2);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize(
+         acl_src, output_size_array, acl_dst, &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor,
+                                      ctx.stream()));
+
+     ACL_CHECK(aclDestroyIntArray(output_size_array));
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ /**
+  * @brief Pads a tensor with a specified value along each dimension.
+  *
+  * This function performs padding of the source tensor `acl_src` and stores the
+  * result in the destination tensor `acl_dst`. The padding values for each
+  * dimension are specified in the `paddings` array.
+  *
+  * @param ctx The context for the CANN backend operations.
+  * @param acl_src The source tensor to be padded.
+  * @param acl_dst The destination tensor where the padded result will be stored.
+  * @param paddings An array specifying the padding values for each dimension.
+  * The size of the array should be twice the number of dimensions of the tensor.
+  * @param value The value to be used for padding. The default value is 0.0.
+  */
+ static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                       aclTensor* acl_dst, int64_t* paddings,
+                       float value = 0.0f) {
+     aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
+     aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnConstantPadNdGetWorkspaceSize(
+         acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor));
+
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(aclnnConstantPadNd(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+
+     ACL_CHECK(aclDestroyIntArray(acl_pad));
+     ACL_CHECK(aclDestroyScalar(acl_value));
+ }
+
+ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+     aclTensor* acl_src = ggml_cann_create_tensor(src);
+     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+     // padding: value in the array means how much distance will be padding.
+     // the position of elements in the array means which dirction to padding,
+     // each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind,
+     //                       dim2.front, dim2.behind, dim3.front, dim3.behind]
+     int64_t paddings[] = {
+         0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
+         0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
+     aclnn_pad(ctx, acl_src, acl_dst, paddings);
+
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+     ACL_CHECK(aclDestroyTensor(acl_src));
+ }
+
+ /**
+  * @brief Performs 2D average pooling on the input tensor and stores the result
+  * in the destination tensor.
+  *
+  * This function performs average pooling on the source tensor and stores the
+  * result in the destination tensor. The pooling parameters (kernel size,
+  * strides, padding) are specified in the `op_params` of the destination tensor.
+  *
+  * @param ctx The context for the CANN backend operations.
+  * @param dst The destination tensor where the result will be stored. The source
+  * tensor is referenced by `dst->src[0]`.
+  */
+ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
+                                  ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+     GGML_ASSERT(src->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     aclTensor* acl_src =
+         ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+     aclTensor* acl_dst =
+         ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+     const int32_t* opts = (const int32_t*)dst->op_params;
+     const int k0 = opts[1];
+     const int k1 = opts[2];
+     const int s0 = opts[3];
+     const int s1 = opts[4];
+     const int p0 = opts[5];
+     const int p1 = opts[6];
+
+     std::vector<int64_t> kernel_dims = {k1, k0};
+     std::vector<int64_t> stride_dims = {s1, s0};
+     std::vector<int64_t> padding_avg_dims = {p1, p0};  // (padH, padW)
+
+     auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+     auto* strides = aclCreateIntArray(stride_dims.data(), 2);
+     auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2);
+
+     bool ceil_mode = false;
+     bool count_include_pad = true;
+     int64_t divisor_override = 0;
+     int8_t cube_math_type = 0;
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize(
+         acl_src, kernel_size, strides, paddings_avg, ceil_mode,
+         count_include_pad, divisor_override, cube_math_type, acl_dst,
+         &workspaceSize, &executor));
+
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+     ACL_CHECK(
+         aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+     ACL_CHECK(aclDestroyIntArray(kernel_size));
+     ACL_CHECK(aclDestroyIntArray(strides));
+     ACL_CHECK(aclDestroyIntArray(paddings_avg));
+ }
+
+ /**
+  * @brief Performs 2D max pooling on the input tensor and stores the result in
+  * the destination tensor.
+  *
+  * This function performs max pooling on the source tensor and stores the result
+  * in the destination tensor. The pooling parameters (kernel size, strides,
+  * padding) are specified in the `op_params` of the destination tensor.
+  *
+  * @param ctx The context for the CANN backend operations.
+  * @param dst The destination tensor where the result will be stored. The source
+  * tensor is referenced by `dst->src[0]`.
+  */
+ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
+                                  ggml_tensor* dst) {
+     ggml_tensor* src = dst->src[0];
+     GGML_ASSERT(src->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     aclTensor* acl_src =
+         ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+     aclTensor* acl_dst =
+         ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+     const int32_t* opts = (const int32_t*)dst->op_params;
+     const int k0 = opts[1];
+     const int k1 = opts[2];
+     const int s0 = opts[3];
+     const int s1 = opts[4];
+     const int p0 = opts[5];
+     const int p1 = opts[6];
+
+     int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2],
+                          src->ne[3]};
+     size_t temp_nb[GGML_MAX_DIMS];
+
+     temp_nb[0] = ggml_element_size(src);
+     for (int i = 1; i < GGML_MAX_DIMS; i++) {
+         temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
+     }
+
+     ggml_cann_pool_alloc temp_buffer_allocator(
+         ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
+     void* buffer = temp_buffer_allocator.get();
+     aclTensor* tmp_tensor = ggml_cann_create_tensor(
+         buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
+         GGML_MAX_DIMS, ACL_FORMAT_NCHW);
+
+     // pad: see padding in ggml_cann_pad()
+     int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0};
+     float value = -FLT_MAX;
+     aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value);
+
+     // max_pool
+     std::vector<int64_t> kernel_dims = {k1, k0};
+     std::vector<int64_t> stride_dims = {s1, s0};
+     // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
+     std::vector<int64_t> padding_max_dims = {0, 0, 0, 0};
+     std::vector<int64_t> dilation_size = {1, 1};
+     auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+     auto* strides = aclCreateIntArray(stride_dims.data(), 2);
+     auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4);
+     auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
+
+     bool ceil_mode = false;
+     int64_t auto_pads = 0;
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnMaxPoolGetWorkspaceSize(
+         tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations,
+         ceil_mode, acl_dst, &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(
+         aclnnMaxPool(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyTensor(acl_src));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+     ACL_CHECK(aclDestroyTensor(tmp_tensor));
+     ACL_CHECK(aclDestroyIntArray(kernel_size));
+     ACL_CHECK(aclDestroyIntArray(strides));
+     ACL_CHECK(aclDestroyIntArray(paddings_max));
+     ACL_CHECK(aclDestroyIntArray(dilations));
+ }
+
+ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     const int32_t* opts = (const int32_t*)dst->op_params;
+     enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+     switch (op) {
+         case GGML_OP_POOL_AVG:
+             ggml_cann_avg_pool2d(ctx, dst);
+             break;
+         case GGML_OP_POOL_MAX:
+             ggml_cann_max_pool2d(ctx, dst);
+             break;
+         case GGML_OP_POOL_COUNT:
+             GGML_ABORT("fatal error");
+             break;
+     }
+ }
+
+ /**
+  * @brief Copies data from the source tensor to the destination tensor.
+  *
+  * This function copies data from the source tensor `acl_src` to the destination
+  * tensor `acl_dst`.
+  *
+  * @param ctx The context for the CANN backend operations.
+  * @param acl_src The source tensor from which data will be copied.
+  * @param acl_dst The destination tensor where the data will be copied to.
+  */
+ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                       aclTensor* acl_dst) {
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnInplaceCopyGetWorkspaceSize(acl_dst, acl_src, &workspaceSize,
+                                                &executor));
+
+     if (workspaceSize > 0) {
+         ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+         workspaceAddr = workspace_allocator.get();
+     }
+
+     ACL_CHECK(
+         aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, ctx.stream()));
+ }
879
+
880
+ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
881
+ ggml_tensor* src = dst->src[0];
882
+
883
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
884
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
885
+
886
+ ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
887
+ ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
888
+ src->extra = src_extra_allocator.get();
889
+ dst->extra = dst_extra_allocator.get();
890
+ ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
891
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
892
+ ctx.stream()));
893
+ ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
894
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
895
+ ctx.stream()));
896
+
897
+ if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
898
+ ggml_are_same_shape(src, dst)) {
899
+ cann_copy(ctx, acl_src, acl_dst);
900
+ ACL_CHECK(aclDestroyTensor(acl_src));
901
+ ACL_CHECK(aclDestroyTensor(acl_dst));
902
+ return;
903
+ }
904
+ // TODO: simplify
905
+ if (src->type == GGML_TYPE_F16) {
906
+ if (dst->type == GGML_TYPE_Q8_0) {
907
+ aclrtlaunch_ascendc_quantize_f16_q8_0(
908
+ 24, ctx.stream(), src->data, dst->data,
909
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
910
+ ((ggml_tensor*)dst->extra)->ne);
911
+ return;
912
+ }
913
+ if (dst->type == GGML_TYPE_F16) {
914
+ if (ggml_are_same_shape(src, dst)) {
915
+ cann_copy(ctx, acl_src, acl_dst);
916
+ ACL_CHECK(aclDestroyTensor(acl_src));
917
+ ACL_CHECK(aclDestroyTensor(acl_dst));
918
+ return;
919
+ }
920
+ if (ggml_is_contiguous(dst)) {
921
+ const size_t src_type_size = ggml_type_size(src->type);
922
+ if (src->nb[0] == src_type_size) {
923
+ // src0 is contiguous on first dimension, copy by rows
924
+ int64_t rows_num = ggml_nrows(src);
925
+
926
+ aclrtlaunch_ascendc_dup_by_rows_fp16(
927
+ rows_num, ctx.stream(), src->data, dst->data,
928
+ ((ggml_tensor*)src->extra)->ne,
929
+ ((ggml_tensor*)src->extra)->nb,
930
+ ((ggml_tensor*)dst->extra)->ne,
931
+ ((ggml_tensor*)dst->extra)->nb);
932
+ return;
933
+ }
934
+ GGML_ABORT("fatal error");
935
+ }
936
+ GGML_ABORT("fatal error");
937
+ }
938
+ if (dst->type == GGML_TYPE_F32) {
939
+ if (ggml_are_same_shape(src, dst)) {
940
+ cann_copy(ctx, acl_src, acl_dst);
941
+ ACL_CHECK(aclDestroyTensor(acl_src));
942
+ ACL_CHECK(aclDestroyTensor(acl_dst));
943
+ return;
944
+ }
945
+ if (ggml_is_contiguous(dst)) {
946
+ const size_t src_type_size = ggml_type_size(src->type);
947
+ if (src->nb[0] == src_type_size) {
948
+ // src0 is contiguous on first dimension, copy by rows
949
+ int64_t rows_num = ggml_nrows(src);
950
+ aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
951
+ rows_num, ctx.stream(), src->data, dst->data,
952
+ ((ggml_tensor*)src->extra)->ne,
953
+ ((ggml_tensor*)src->extra)->nb,
954
+ ((ggml_tensor*)dst->extra)->ne,
955
+ ((ggml_tensor*)dst->extra)->nb);
956
+ return;
957
+ }
958
+ GGML_ABORT("fatal error");
959
+ }
960
+ GGML_ABORT("fatal error");
961
+ }
962
+ // TODO
963
+ GGML_ABORT("fatal error");
964
+ } else if (src->type == GGML_TYPE_F32) {
965
+ // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
966
+ // && nb0 == type_size)
967
+ if (dst->type == GGML_TYPE_Q8_0) {
968
+ aclrtlaunch_ascendc_quantize_f32_q8_0(
969
+ 24, ctx.stream(), src->data, dst->data,
970
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
971
+ ((ggml_tensor*)dst->extra)->ne);
972
+ return;
973
+ }
974
+ if (dst->type == GGML_TYPE_F32) {
975
+ if (ggml_are_same_shape(src, dst)) {
976
+ cann_copy(ctx, acl_src, acl_dst);
977
+ ACL_CHECK(aclDestroyTensor(acl_src));
978
+ ACL_CHECK(aclDestroyTensor(acl_dst));
979
+ return;
980
+ }
981
+ if (ggml_is_contiguous(dst)) {
982
+ const size_t src_type_size = ggml_type_size(src->type);
983
+ if (src->nb[0] == src_type_size) {
984
+ // src0 is contiguous on first dimension, copy by rows
985
+ int64_t rows_num = ggml_nrows(src);
986
+ aclrtlaunch_ascendc_dup_by_rows_fp32(
987
+ rows_num, ctx.stream(), src->data, dst->data,
988
+ ((ggml_tensor*)src->extra)->ne,
989
+ ((ggml_tensor*)src->extra)->nb,
990
+ ((ggml_tensor*)dst->extra)->ne,
991
+ ((ggml_tensor*)dst->extra)->nb);
992
+ return;
993
+ }
994
+ GGML_ABORT("fatal error");
995
+ } else {
996
+ // TODO: dst not contiguous
997
+ GGML_ABORT("fatal error");
998
+ }
999
+ }
1000
+ if (dst->type == GGML_TYPE_F16) {
1001
+ if (ggml_are_same_shape(src, dst)) {
1002
+ cann_copy(ctx, acl_src, acl_dst);
1003
+ ACL_CHECK(aclDestroyTensor(acl_src));
1004
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1005
+ return;
1006
+ }
1007
+ if (ggml_is_contiguous(dst)) {
1008
+ const size_t src_type_size = ggml_type_size(src->type);
1009
+ if (src->nb[0] == src_type_size) {
1010
+ // src0 is contiguous on first dimension, copy by rows
1011
+ int64_t rows_num = ggml_nrows(src);
1012
+ aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
1013
+ rows_num, ctx.stream(), src->data, dst->data,
1014
+ ((ggml_tensor*)src->extra)->ne,
1015
+ ((ggml_tensor*)src->extra)->nb,
1016
+ ((ggml_tensor*)dst->extra)->ne,
1017
+ ((ggml_tensor*)dst->extra)->nb);
1018
+ return;
1019
+ }
1020
+ GGML_ABORT("fatal error");
1021
+ }
1022
+ }
1023
+ // TODO
1024
+ GGML_ABORT("fatal error");
1025
+ } else {
1026
+ if (ggml_are_same_shape(src, dst)) {
1027
+ cann_copy(ctx, acl_src, acl_dst);
1028
+ ACL_CHECK(aclDestroyTensor(acl_src));
1029
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1030
+ return;
1031
+ }
1032
+ GGML_ABORT("fatal error");
1033
+ }
1034
+ }
1035
+
1036
+ #ifdef __cplusplus
1037
+ extern "C" {
1038
+ #endif
1039
+ aclnnStatus aclnnRmsNormGetWorkspaceSize(const aclTensor* x,
1040
+ const aclTensor* gamma, double epsilon,
1041
+ const aclTensor* yOut,
1042
+ const aclTensor* rstdOut,
1043
+ uint64_t* workspaceSize,
1044
+ aclOpExecutor** executor);
1045
+ aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize,
1046
+ aclOpExecutor* executor, aclrtStream stream);
1047
+ #ifdef __cplusplus
1048
+ }
1049
+ #endif
1050
+
1051
+ /**
1052
+ * @brief Creates an ACL tensor initialized with zeros using a provided buffer.
1053
+ *
1054
+ * This function initializes a tensor with zeros using the specified buffer and
1055
+ * tensor parameters.
1056
+ *
1057
+ * @param ctx The context for the CANN backend operations.
1058
+ * @param buffer The buffer to be used for the tensor data.
1059
+ * @param n_bytes The size of the buffer in bytes.
1060
+ * @param ne An array specifying the extents (sizes) of each dimension of the
1061
+ * tensor.
1062
+ * @param dims The number of dimensions of the tensor.
1063
+ * @param type The data type of the tensor.
1064
+ * @param type_size The size of each element in the tensor data type.
1065
+ * @return An ACL tensor initialized with zeros.
1066
+ */
1067
+ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
1068
+ size_t n_bytes, int64_t* ne, int64_t dims,
1069
+ aclDataType type, size_t type_size) {
1070
+ size_t nb[GGML_MAX_DIMS];
1071
+ nb[0] = type_size;
1072
+ for (int i = 1; i < dims; i++) {
1073
+ nb[i] = nb[i - 1] * ne[i - 1];
1074
+ }
1075
+
1076
+ ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream()));
1077
+ aclTensor* zero =
1078
+ ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
1079
+ return zero;
1080
+ }
1081
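The stride loop in aclnn_zero (nb[i] = nb[i - 1] * ne[i - 1]) is the contiguous layout convention used throughout this file: nb[0] is the element size and each further stride is the previous stride times the previous extent. A small self-contained sketch of the same convention, with a helper turning an index vector into a byte offset (both helpers are illustrative, not part of the patch):

#include <cstddef>
#include <cstdint>

// Illustrative: build contiguous strides the same way aclnn_zero does above.
static void contiguous_strides(const int64_t* ne, int64_t dims, size_t type_size,
                               size_t* nb) {
    nb[0] = type_size;
    for (int64_t i = 1; i < dims; i++) {
        nb[i] = nb[i - 1] * (size_t) ne[i - 1];
    }
}

// Illustrative: with such strides, an element's byte offset is the dot product
// of its index vector with nb.
static size_t element_offset(const int64_t* idx, const size_t* nb, int64_t dims) {
    size_t off = 0;
    for (int64_t i = 0; i < dims; i++) {
        off += (size_t) idx[i] * nb[i];
    }
    return off;
}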
+
1082
+ /**
1083
+ * @brief Creates an ACL tensor initialized with ones using a provided buffer.
1084
+ *
1085
+ * This function initializes a tensor with ones using the specified buffer and
1086
+ * tensor parameters.
1087
+ *
1088
+ * @param ctx The context for the CANN backend operations.
1089
+ * @param buffer The buffer to be used for the tensor data.
1090
+ * @param n_bytes The size of the buffer in bytes.
1091
+ * @param ne An array specifying the extents (sizes) of each dimension of the
1092
+ * tensor.
1093
+ * @param dims The number of dimensions of the tensor.
1094
+ * @param type The data type of the tensor.
1095
+ * @param type_size The size of each element in the tensor data type.
1096
+ * @param value The value to be used for initializing the tensor (default
1097
+ * is 1.0).
1098
+ * @return An ACL tensor initialized with ones.
1099
+ */
1100
+ static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer,
1101
+ size_t n_bytes, int64_t* ne, int64_t dims,
1102
+ aclDataType type, size_t type_size,
1103
+ float value = 1.0f) {
1104
+ aclTensor* acl_tensor =
1105
+ aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
1106
+ float alpha_host = 1.0f;
1107
+ aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
1108
+ aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
1109
+
1110
+ uint64_t workspaceSize = 0;
1111
+ aclOpExecutor* executor;
1112
+ void* workspaceAddr = nullptr;
1113
+
1114
+ ACL_CHECK(aclnnInplaceAddsGetWorkspaceSize(acl_tensor, other, alpha,
1115
+ &workspaceSize, &executor));
1116
+
1117
+ if (workspaceSize > 0) {
1118
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1119
+ workspaceAddr = workspace_allocator.get();
1120
+ }
1121
+ ACL_CHECK(
1122
+ aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream()));
1123
+
1124
+ return acl_tensor;
1125
+ }
1126
+
1127
+ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1128
+ ggml_tensor* src = dst->src[0];
1129
+
1130
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
1131
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1132
+
1133
+ float eps;
1134
+ memcpy(&eps, dst->op_params, sizeof(float));
1135
+
1136
+ GGML_ASSERT(eps > 0.0f);
1137
+
1138
+ uint64_t workspaceSize = 0;
1139
+ aclOpExecutor* executor;
1140
+ void* workspaceAddr = nullptr;
1141
+
1142
+ size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
1143
+ ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
1144
+
1145
+ aclTensor* acl_gamma = aclnn_ones(
1146
+ ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
1147
+ ggml_cann_type_mapping(src->type), ggml_element_size(src));
1148
+
1149
+ size_t zero_tensor_n_bytes =
1150
+ src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src);
1151
+ ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes);
1152
+ aclTensor* acl_rstd =
1153
+ aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
1154
+ src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
1155
+ ggml_element_size(src));
1156
+
1157
+ ACL_CHECK(aclnnRmsNormGetWorkspaceSize(
1158
+ acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor));
1159
+
1160
+ if (workspaceSize > 0) {
1161
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1162
+ workspaceAddr = workspace_allocator.get();
1163
+ }
1164
+
1165
+ ACL_CHECK(
1166
+ aclnnRmsNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
1167
+
1168
+ ACL_CHECK(aclDestroyTensor(acl_src));
1169
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1170
+ ACL_CHECK(aclDestroyTensor(acl_gamma));
1171
+ ACL_CHECK(aclDestroyTensor(acl_rstd));
1172
+ }
1173
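With acl_gamma filled with ones, the aclnnRmsNorm call above reduces to plain RMS normalisation of each row. A reference CPU sketch of that per-row computation (an illustration only, not the backend code):

#include <cmath>
#include <cstdint>

// Reference only: y_i = x_i / sqrt(mean(x^2) + eps) over one row of length n,
// which is what aclnnRmsNorm computes when gamma is all ones.
static void rms_norm_row_ref(const float* x, float* y, int64_t n, float eps) {
    double sum_sq = 0.0;
    for (int64_t i = 0; i < n; i++) {
        sum_sq += (double) x[i] * (double) x[i];
    }
    const float scale = 1.0f / sqrtf((float) (sum_sq / (double) n) + eps);
    for (int64_t i = 0; i < n; i++) {
        y[i] = x[i] * scale;
    }
}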
+
1174
+ // TODO: performance is low.
1175
+ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
1176
+ float value) {
1177
+ ggml_tensor* src = dst->src[0];
1178
+
1179
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
1180
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1181
+
1182
+ const int n_past = ((int32_t*)dst->op_params)[0];
1183
+
1184
+ size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *
1185
+ src->ne[3] * ggml_element_size(src);
1186
+ ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
1187
+
1188
+ aclTensor* mask_tensor =
1189
+ aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne,
1190
+ GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
1191
+ ggml_element_size(src), value);
1192
+
1193
+ uint64_t workspaceSize = 0;
1194
+ aclOpExecutor* executor;
1195
+ void* workspaceAddr = nullptr;
1196
+
1197
+ ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1,
1198
+ &workspaceSize, &executor));
1199
+ if (workspaceSize > 0) {
1200
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1201
+ workspaceAddr = workspace_allocator.get();
1202
+ }
1203
+
1204
+ ACL_CHECK(
1205
+ aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream()));
1206
+
1207
+ ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst,
1208
+ &workspaceSize, &executor));
1209
+ if (workspaceSize > 0) {
1210
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1211
+ workspaceAddr = workspace_allocator.get();
1212
+ }
1213
+
1214
+ ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream()));
1215
+
1216
+ aclScalar* alpha = nullptr;
1217
+ float alphaValue = 1.0f;
1218
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
1219
+
1220
+ ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha,
1221
+ &workspaceSize, &executor));
1222
+ if (workspaceSize > 0) {
1223
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1224
+ workspaceAddr = workspace_allocator.get();
1225
+ }
1226
+ ACL_CHECK(
1227
+ aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
1228
+
1229
+ ACL_CHECK(aclDestroyScalar(alpha));
1230
+ ACL_CHECK(aclDestroyTensor(mask_tensor));
1231
+ ACL_CHECK(aclDestroyTensor(acl_src));
1232
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1233
+ }
1234
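The Triu/Tril/Add sequence above builds the usual causal diagonal mask: every element whose column index exceeds n_past plus its row index is replaced by `value` (typically -INF), everything else keeps the source element. An illustrative CPU equivalent:

#include <cstdint>

// Reference only: apply the causal diagonal mask to a rows x cols matrix.
static void diag_mask_ref(const float* src, float* dst, int64_t rows, int64_t cols,
                          int n_past, float value) {
    for (int64_t r = 0; r < rows; r++) {
        for (int64_t c = 0; c < cols; c++) {
            dst[r * cols + c] = (c > n_past + r) ? value : src[r * cols + c];
        }
    }
}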
+
1235
+ /**
1236
+ * @brief Casts the data type of a source tensor to a destination tensor.
1237
+ *
1238
+ * This function casts the data type of the source tensor `acl_src` to the
1239
+ * specified data type `cast_data_type` and stores the result in the destination
1240
+ * tensor `acl_dst`.
1241
+ *
1242
+ * @param ctx The context for the CANN backend operations.
1243
+ * @param acl_src The source tensor whose data type will be casted.
1244
+ * @param acl_dst The destination tensor where the casted result will be stored.
1245
+ * @param cast_data_type The target data type to which the source tensor will be
1246
+ * casted.
1247
+ */
1248
+ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1249
+ aclTensor* acl_dst, aclDataType cast_data_type) {
1250
+ uint64_t workspaceSize = 0;
1251
+ aclOpExecutor* executor;
1252
+ void* workspaceAddr = nullptr;
1253
+
1254
+ ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst,
1255
+ &workspaceSize, &executor));
1256
+ if (workspaceSize > 0) {
1257
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1258
+ workspaceAddr = workspace_allocator.get();
1259
+ }
1260
+
1261
+ ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
1262
+ }
1263
+
1264
+ /**
1265
+ * @brief Permutes the dimensions of a tensor according to a specified order.
1266
+ *
1267
+ * This function permutes the dimensions of the source tensor `acl_src`
1268
+ * according to the order specified in the `new_dim` array and stores the result
1269
+ * in the destination tensor `acl_dst`.
1270
+ *
1271
+ * @param ctx The context for the CANN backend operations.
1272
+ * @param acl_src The source tensor whose dimensions will be permuted.
1273
+ * @param acl_dst The destination tensor where the permuted result will be
1274
+ * stored.
1275
+ * @param new_dim An array specifying the new order of dimensions for the
1276
+ * tensor.
1277
+ * @param dims The number of dimensions in the tensor.
1278
+ */
1279
+ static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1280
+ aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
1281
+ aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
1282
+
1283
+ uint64_t workspaceSize = 0;
1284
+ aclOpExecutor* executor;
1285
+ void* workspaceAddr = nullptr;
1286
+
1287
+ ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst,
1288
+ &workspaceSize, &executor));
1289
+ if (workspaceSize > 0) {
1290
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1291
+ workspaceAddr = workspace_allocator.get();
1292
+ }
1293
+
1294
+ ACL_CHECK(
1295
+ aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));
1296
+
1297
+ ACL_CHECK(aclDestroyIntArray(acl_dims));
1298
+ }
1299
+
1300
+ #ifdef __cplusplus
1301
+ extern "C" {
1302
+ #endif
1303
+ aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self,
1304
+ const aclIntArray* kernelSize,
1305
+ const aclIntArray* dilation,
1306
+ const aclIntArray* padding,
1307
+ const aclIntArray* stride,
1308
+ aclTensor* out, uint64_t* workspaceSize,
1309
+ aclOpExecutor** executor);
1310
+ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
1311
+ aclOpExecutor* executor, aclrtStream stream);
1312
+ #ifdef __cplusplus
1313
+ }
1314
+ #endif
1315
+ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1316
+ ggml_tensor* src0 = dst->src[0]; // kernel
1317
+ ggml_tensor* src1 = dst->src[1]; // input
1318
+
1319
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
1320
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
1321
+ GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
1322
+
1323
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
1324
+ const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
1325
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
1326
+ const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
1327
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
1328
+ const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
1329
+ const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
1330
+
1331
+ GGML_TENSOR_BINARY_OP_LOCALS;
1332
+
1333
+ const int64_t N = is_2D ? ne13 : ne12;
1334
+ const int64_t IC = is_2D ? ne12 : ne11;
1335
+
1336
+ const int64_t KH = is_2D ? ne01 : 1;
1337
+ const int64_t KW = ne00;
1338
+
1339
+ const int64_t OH = is_2D ? ne2 : 1;
1340
+ const int64_t OW = ne1;
1341
+
1342
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
1343
+ GGML_ASSERT(nb10 == sizeof(float));
1344
+
1345
+ // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
1346
+ aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
1347
+ int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
1348
+ size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
1349
+
1350
+ tmp_im2col_nb[0] = ggml_type_size(src1->type);
1351
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1352
+ tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
1353
+ }
1354
+
1355
+ // Calculate im2col.
1356
+ // If dst is f16, the temporary im2col buffer is f32, so allocate
1357
+ // src type size * dst element count bytes.
1358
+ ggml_cann_pool_alloc im2col_allocator(
1359
+ ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1));
1360
+ void* tmp_im2col_buffer = im2col_allocator.get();
1361
+ aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
1362
+ tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
1363
+ ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
1364
+ GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1365
+
1366
+ std::vector<int64_t> kernel_dims = {KH, KW};
1367
+ std::vector<int64_t> dilation_size = {d1, d0};
1368
+ std::vector<int64_t> padding_dims = {p1, p0};
1369
+ std::vector<int64_t> stride_dims = {s1, s0};
1370
+ auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
1371
+ auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
1372
+ auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
1373
+ auto* strides = aclCreateIntArray(stride_dims.data(), 2);
1374
+
1375
+ uint64_t workspaceSize = 0;
1376
+ aclOpExecutor* executor;
1377
+ void* workspaceAddr = nullptr;
1378
+
1379
+ ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations,
1380
+ paddings, strides, tmp_im2col_tensor,
1381
+ &workspaceSize, &executor));
1382
+
1383
+ if (workspaceSize > 0) {
1384
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1385
+ workspaceAddr = workspace_allocator.get();
1386
+ }
1387
+
1388
+ ACL_CHECK(
1389
+ aclnnIm2col(workspaceAddr, workspaceSize, executor, ctx.stream()));
1390
+
1391
+ // Cast if dst is f16.
1392
+ aclTensor* tmp_cast_tensor = nullptr;
1393
+ ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
1394
+ if (src1->type != dst->type) {
1395
+ tmp_cast_allocator.alloc(ggml_nbytes(dst));
1396
+ void* tmp_cast_buffer = tmp_cast_allocator.get();
1397
+ size_t temp_cast_nb[GGML_MAX_DIMS - 1];
1398
+ temp_cast_nb[0] = ggml_type_size(dst->type);
1399
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1400
+ temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
1401
+ }
1402
+
1403
+ tmp_cast_tensor = ggml_cann_create_tensor(
1404
+ tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
1405
+ ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
1406
+ GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1407
+ aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
1408
+ ggml_cann_type_mapping(dst->type));
1409
+ }
1410
+
1411
+ // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
1412
+ int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
1413
+ size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
1414
+ aclTensor* acl_dst =
1415
+ ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
1416
+
1417
+ int64_t permute_dim[] = {0, 2, 1};
1418
+ if (src1->type != dst->type) {
1419
+ aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
1420
+ } else {
1421
+ aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
1422
+ }
1423
+
1424
+ // release
1425
+ ACL_CHECK(aclDestroyTensor(acl_src1));
1426
+ ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
1427
+ ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
1428
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1429
+ ACL_CHECK(aclDestroyIntArray(kernel_size));
1430
+ ACL_CHECK(aclDestroyIntArray(dilations));
1431
+ ACL_CHECK(aclDestroyIntArray(paddings));
1432
+ ACL_CHECK(aclDestroyIntArray(strides));
1433
+ }
1434
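The OH/OW extents used above follow the standard convolution output-size formula for the stride, padding and dilation stored in op_params. A one-line helper, shown here only for reference (not part of the patch):

#include <cstdint>

// Reference only: output extent of one convolution dimension.
static int64_t conv_out_size(int64_t in, int64_t kernel, int64_t stride,
                             int64_t pad, int64_t dilation) {
    return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}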
+
1435
+ /**
1436
+ * @brief Applies element-wise exponential function to the elements of a tensor.
1437
+ *
1438
+ * This function computes the exponential of each element in the source tensor
1439
+ * `acl_src` and stores the result back into the same tensor.
1440
+ * The operation is defined as:
1441
+ * \f[
1442
+ * \text {acl_src }_i=e^{acl\_src_i}
1443
+ * \f]
1444
+ *
1445
+ * @param ctx The context for the CANN backend operations.
1446
+ * @param acl_src The tensor on which the exponential function will be applied.
1447
+ */
1448
+ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
1449
+ uint64_t workspaceSize = 0;
1450
+ aclOpExecutor* executor;
1451
+ void* workspaceAddr = nullptr;
1452
+
1453
+ ACL_CHECK(
1454
+ aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor));
1455
+ if (workspaceSize > 0) {
1456
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1457
+ workspaceAddr = workspace_allocator.get();
1458
+ }
1459
+
1460
+ ACL_CHECK(
1461
+ aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream()));
1462
+ }
1463
+
1464
+ /**
1465
+ * @brief Multiplies elements of a tensor by a scalar value, optionally
1466
+ * in-place.
1467
+ *
1468
+ * This function multiplies each element of the source tensor `acl_src` by the
1469
+ * scalar `scale` and stores the result in the destination tensor `acl_dst`. If
1470
+ * `inplace` is true, `acl_dst` will not be used and the operation is performed
1471
+ * in-place on `acl_src`.
1472
+ * The operation is defined as:
1473
+ * \f[
1474
+ * \text {acl_dst }_i=\text {acl_src }_i \times \text {scale}
1475
+ * \f]
1476
+ *
1477
+ * @param ctx The context for the CANN backend operations.
1478
+ * @param acl_src The source tensor whose elements will be multiplied.
1479
+ * @param scale The scalar value by which each element of `acl_src` will be
1480
+ * multiplied.
1481
+ * @param acl_dst The destination tensor where the result will be stored if
1482
+ * `inplace` is false.
1483
+ * @param inplace Flag indicating whether to perform the operation in-place on
1484
+ * `acl_src`.
1485
+ */
1486
+ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1487
+ float scale, aclTensor* acl_dst, bool inplace) {
1488
+ aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
1489
+
1490
+ uint64_t workspaceSize = 0;
1491
+ aclOpExecutor* executor;
1492
+ void* workspaceAddr = nullptr;
1493
+
1494
+ if (inplace) {
1495
+ ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale,
1496
+ &workspaceSize, &executor));
1497
+ if (workspaceSize > 0) {
1498
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1499
+ workspaceAddr = workspace_allocator.get();
1500
+ }
1501
+
1502
+ ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor,
1503
+ ctx.stream()));
1504
+ } else {
1505
+ ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst,
1506
+ &workspaceSize, &executor));
1507
+ if (workspaceSize > 0) {
1508
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1509
+ workspaceAddr = workspace_allocator.get();
1510
+ }
1511
+
1512
+ ACL_CHECK(
1513
+ aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
1514
+ }
1515
+
1516
+ ACL_CHECK(aclDestroyScalar(acl_scale));
1517
+ }
1518
+
1519
+ /**
1520
+ * @brief Performs an in-place element-wise multiplication of two tensors.
1521
+ *
1522
+ * This function performs an element-wise multiplication of the tensors
1523
+ * `acl_src` and `acl_other` and stores the result in `acl_src`.
1524
+ * The operation is defined as:
1525
+ * \f[
1526
+ * \text {acl_src }_i=\text {acl_src }_i \times \text {acl_other }_i
1527
+ * \f]
1528
+ *
1529
+ * @param ctx The context for the CANN backend operations.
1530
+ * @param acl_src The source tensor where the multiplication result will be
1531
+ * stored.
1532
+ * @param acl_other The tensor whose elements will be multiplied with `acl_src`.
1533
+ */
1534
+ static void aclnn_inplace_mul(ggml_backend_cann_context& ctx,
1535
+ aclTensor* acl_src, aclTensor* acl_other) {
1536
+ uint64_t workspaceSize = 0;
1537
+ aclOpExecutor* executor;
1538
+ void* workspaceAddr = nullptr;
1539
+
1540
+ ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other,
1541
+ &workspaceSize, &executor));
1542
+ if (workspaceSize > 0) {
1543
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1544
+ workspaceAddr = workspace_allocator.get();
1545
+ }
1546
+
1547
+ ACL_CHECK(
1548
+ aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
1549
+ }
1550
+
1551
+ /**
1552
+ * @brief Performs element-wise multiplication of two tensors and stores the
1553
+ * result in a destination tensor.
1554
+ *
1555
+ * This function performs element-wise multiplication of the tensors `acl_src`
1556
+ * and `acl_other` and stores the result in the destination tensor `acl_dst`.
1557
+ * The operation is defined as:
1558
+ * \f[
1559
+ * \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i
1560
+ * \f]
1561
+ *
1562
+ * @param ctx The context for the CANN backend operations.
1563
+ * @param acl_src The first tensor for element-wise multiplication.
1564
+ * @param acl_other The second tensor for element-wise multiplication.
1565
+ * @param acl_dst The destination tensor where the result will be stored.
1566
+ */
1567
+ static void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1568
+ aclTensor* acl_other, aclTensor* acl_dst) {
1569
+ uint64_t workspaceSize = 0;
1570
+ aclOpExecutor* executor;
1571
+ void* workspaceAddr = nullptr;
1572
+
1573
+ ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst,
1574
+ &workspaceSize, &executor));
1575
+ if (workspaceSize > 0) {
1576
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1577
+ workspaceAddr = workspace_allocator.get();
1578
+ }
1579
+
1580
+ ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
1581
+ }
1582
+
1583
+ /**
1584
+ * @brief Applies element-wise cosine function to the elements of a tensor.
1585
+ *
1586
+ * This function computes the cosine of each element in the source tensor
1587
+ * `acl_src` and stores the result in the destination tensor `acl_dst`. The
1588
+ * operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src
1589
+ * }_i\right) \f]
1590
+ *
1591
+ * @param ctx The context for the CANN backend operations.
1592
+ * @param acl_src The source tensor on which the cosine function will be
1593
+ * applied.
1594
+ * @param acl_dst The destination tensor where the cosine results will be
1595
+ * stored.
1596
+ */
1597
+ static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1598
+ aclTensor* acl_dst) {
1599
+ uint64_t workspaceSize = 0;
1600
+ aclOpExecutor* executor;
1601
+ void* workspaceAddr = nullptr;
1602
+
1603
+ ACL_CHECK(
1604
+ aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
1605
+ if (workspaceSize > 0) {
1606
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1607
+ workspaceAddr = workspace_allocator.get();
1608
+ }
1609
+
1610
+ ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream()));
1611
+ }
1612
+
1613
+ /**
1614
+ * @brief Applies element-wise sine function to the elements of a tensor.
1615
+ *
1616
+ * This function computes the sine of each element in the source tensor
1617
+ `acl_src`
1618
+ * and stores the result in the destination tensor `acl_dst`.
1619
+ * The operation is defined as:
1620
+ * \f[
1621
+ * \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right)
1622
+ * \f]
1623
+
1624
+ * @param ctx The context for the CANN backend operations.
1625
+ * @param acl_src The source tensor on which the sine function will be applied.
1626
+ * @param acl_dst The destination tensor where the sine results will be stored.
1627
+ */
1628
+ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1629
+ aclTensor* acl_dst) {
1630
+ uint64_t workspaceSize = 0;
1631
+ aclOpExecutor* executor;
1632
+ void* workspaceAddr = nullptr;
1633
+
1634
+ ACL_CHECK(
1635
+ aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
1636
+ if (workspaceSize > 0) {
1637
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1638
+ workspaceAddr = workspace_allocator.get();
1639
+ }
1640
+
1641
+ ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
1642
+ }
1643
+
1644
+ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1645
+ ggml_tensor* dst) {
1646
+ const ggml_tensor* src = dst->src[0];
1647
+
1648
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
1649
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1650
+
1651
+ const int dim = dst->op_params[0];
1652
+ const int max_period = dst->op_params[1];
1653
+ int half = dim / 2;
1654
+
1655
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
1656
+
1657
+ // arange: [0, ..., half)
1658
+ float start = 0;
1659
+ float stop = half;
1660
+ float step = 1;
1661
+ int64_t n_elements_arange = half;
1662
+ int64_t tmp_arange_ne[] = {half};
1663
+ size_t tmp_arange_nb[] = {sizeof(dst->type)};
1664
+
1665
+ ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type));
1666
+ void* tmp_arange_buffer = arange_allocator.get();
1667
+ aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
1668
+ tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
1669
+ ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb,
1670
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1671
+
1672
+ aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange);
1673
+
1674
+ // freq
1675
+ float freq_param = -logf(max_period) / half;
1676
+ bool inplace = true;
1677
+ aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace);
1678
+ aclnn_exp(ctx, tmp_arange_tensor);
1679
+
1680
+ // permute: src [0,1,2,3]->[0,1,3,2]
1681
+ int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
1682
+ size_t tmp_permute_nb[GGML_MAX_DIMS];
1683
+ tmp_permute_nb[0] = ggml_type_size(src->type);
1684
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1685
+ tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1686
+ }
1687
+
1688
+ ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
1689
+ void* tmp_permute_buffer = permute_allocator.get();
1690
+ aclTensor* tmp_permute_tenosr = ggml_cann_create_tensor(
1691
+ tmp_permute_buffer, ggml_cann_type_mapping(src->type),
1692
+ ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
1693
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
1694
+ int64_t permute_dim[] = {0, 1, 3, 2};
1695
+ int64_t num_dims = 4;
1696
+ aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims);
1697
+
1698
+ // timestep * freq
1699
+ int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
1700
+ src->ne[3]};
1701
+ size_t tmp_mul_nb[GGML_MAX_DIMS];
1702
+ tmp_mul_nb[0] = ggml_type_size(src->type);
1703
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1704
+ tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
1705
+ }
1706
+
1707
+ int mul_nelements =
1708
+ src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
1709
+
1710
+ ggml_cann_pool_alloc mul_allocator(
1711
+ ctx.pool(), mul_nelements * ggml_type_size(src->type));
1712
+ void* tmp_mul_buffer = mul_allocator.get();
1713
+ aclTensor* tmp_mul_tensor = ggml_cann_create_tensor(
1714
+ tmp_mul_buffer, ggml_cann_type_mapping(src->type),
1715
+ ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1716
+ ACL_FORMAT_ND);
1717
+ aclnn_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, tmp_mul_tensor);
1718
+
1719
+ // cos
1720
+ ggml_cann_pool_alloc cos_allocator(
1721
+ ctx.pool(), mul_nelements * ggml_type_size(src->type));
1722
+ void* tmp_cos_buffer = cos_allocator.get();
1723
+ aclTensor* tmp_cos_tensor = ggml_cann_create_tensor(
1724
+ tmp_cos_buffer, ggml_cann_type_mapping(dst->type),
1725
+ ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1726
+ ACL_FORMAT_ND);
1727
+
1728
+ aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor);
1729
+
1730
+ // sin
1731
+ ggml_cann_pool_alloc sin_allocator(
1732
+ ctx.pool(), mul_nelements * ggml_type_size(src->type));
1733
+ void* tmp_sin_buffer = sin_allocator.get();
1734
+ aclTensor* tmp_sin_tensor = ggml_cann_create_tensor(
1735
+ tmp_sin_buffer, ggml_cann_type_mapping(dst->type),
1736
+ ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1737
+ ACL_FORMAT_ND);
1738
+
1739
+ aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor);
1740
+
1741
+ // concat
1742
+ int64_t concat_dim = 3;
1743
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1744
+ aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
1745
+ aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
1746
+ aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
1747
+
1748
+ // release
1749
+ // destroying both the tensorList and its elements causes a segmentation fault.
1750
+ ACL_CHECK(aclDestroyTensorList(tensorList));
1751
+ ACL_CHECK(aclDestroyTensor(acl_src));
1752
+ ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
1753
+ ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr));
1754
+ ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
1755
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1756
+ }
1757
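The arange/exp/mul/cos/sin/concat chain above reproduces the usual sinusoidal timestep embedding. A CPU reference for a single timestep, assuming an even `dim` (an illustrative sketch, not backend code):

#include <cmath>

// Reference only: out[i] = cos(t * freq_i), out[half + i] = sin(t * freq_i),
// with freq_i = exp(-ln(max_period) * i / half).
static void timestep_embedding_ref(float t, float* out, int dim, int max_period) {
    const int half = dim / 2;
    for (int i = 0; i < half; i++) {
        const float freq = expf(-logf((float) max_period) * (float) i / (float) half);
        out[i]        = cosf(t * freq);
        out[half + i] = sinf(t * freq);
    }
}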
+
1758
+ /**
1759
+ * @brief Fills a tensor with a scalar value.
1760
+ *
1761
+ * This function fills the destination tensor `acl_dst` with the scalar value
1762
+ * `scalar`.
1763
+ *
1764
+ * @param ctx The context for the CANN backend operations.
1765
+ * @param scalar The scalar value used to fill the tensor.
1766
+ * @param acl_dst The destination tensor to be filled with the scalar value.
1767
+ */
1768
+ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
1769
+ aclTensor* acl_dst) {
1770
+ auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
1771
+
1772
+ uint64_t workspaceSize = 0;
1773
+ aclOpExecutor* executor;
1774
+ void* workspaceAddr = nullptr;
1775
+
1776
+ ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize(
1777
+ acl_dst, acl_scalar, &workspaceSize, &executor));
1778
+ if (workspaceSize > 0) {
1779
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1780
+ workspaceAddr = workspace_allocator.get();
1781
+ }
1782
+
1783
+ ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor,
1784
+ ctx.stream()));
1785
+ ACL_CHECK(aclDestroyScalar(acl_scalar));
1786
+ }
1787
+
1788
+ /**
1789
+ * @brief Raises each element of a tensor to the power of the corresponding
1790
+ * element in another tensor.
1791
+ *
1792
+ * This function computes the element-wise power of the destination tensor
1793
+ * `acl_dst` raised to the power of the exponent tensor `acl_exp`.
1794
+ * The operation is defined as:
1795
+ * \f[
1796
+ * \text {acl_dst }_i=acl\_dst_i^{\text {acl_exp }_i}
1797
+ * \f]
1798
+ *
1799
+ * @param ctx The context for the CANN backend operations.
1800
+ * @param acl_dst The destination tensor, which also serves as the base tensor.
1801
+ * @param acl_exp The exponent tensor, each element of which is used to raise
1802
+ * the corresponding element in the destination tensor.
1803
+ */
1804
+ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
1805
+ aclTensor* acl_dst, aclTensor* acl_exp) {
1806
+ uint64_t workspaceSize = 0;
1807
+ aclOpExecutor* executor;
1808
+ void* workspaceAddr = nullptr;
1809
+
1810
+ ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize(
1811
+ acl_dst, acl_exp, &workspaceSize, &executor));
1812
+ if (workspaceSize > 0) {
1813
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1814
+ workspaceAddr = workspace_allocator.get();
1815
+ }
1816
+
1817
+ ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize,
1818
+ executor, ctx.stream()));
1819
+ }
1820
+
1821
+ /**
1822
+ * @brief Applies the ALiBi (Attention with Linear Biases) mechanism to the
+ * attention scores.
1823
+ * @details This function implements the ALiBi mechanism, which adds fixed,
1824
+ * head-dependent linear biases to the attention scores to encode relative
1825
+ * position without the need for explicit positional
1826
+ * embeddings.
1827
+ *
1828
+ * @param ctx The backend CANN context for executing operations.
1829
+ * @param acl_src The source tensor representing the query or key.
1830
+ * @param acl_position The position tensor containing relative positions.
1831
+ * @param acl_dst The destination tensor where the result will be stored.
1832
+ * @param n_head The number of attention heads.
1833
+ * @param src_ne The dimensions of the source tensor.
1834
+ * @param src_nb0 The byte size of the first dimension of the source
1835
+ tensor.
1836
+ * @param max_bias The maximum bias value used in the Alibi mechanism.
1837
+ * @param dst The destination tensor object for additional metadata.
1838
+ *
1839
+ * The function performs the following steps:
1840
+ * 1. Calculates the logarithm floor of the number of heads to determine the
1841
+ base for bias calculation.
1842
+ * 2. Initializes arrays with arithmetic sequences and fills them with bias
1843
+ values.
1844
+ * 3. Computes the bias tensor based on the calculated biases and arithmetic
1845
+ sequences.
1846
+ * 4. Reshapes the bias tensor to match the dimensions of the input tensors.
1847
+ * 5. Multiplies the position tensor by the bias tensor.
1848
+ * 6. Adds the result of the multiplication to the source tensor to produce the
1849
+ final output.
1850
+ */
1851
+ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1852
+ aclTensor* acl_position, aclTensor* acl_dst,
1853
+ const int n_head, int64_t* src_ne, const size_t src_nb0,
1854
+ float max_bias, ggml_tensor* dst) {
1855
+ const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
1856
+ GGML_ASSERT(src_nb0 == sizeof(float));
1857
+ GGML_ASSERT(n_head == src_ne[2]);
1858
+
1859
+ const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
1860
+
1861
+ float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
1862
+ float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
1863
+
1864
+ // init arange
1865
+ ggml_cann_pool_alloc arange_allocator(ctx.pool(),
1866
+ ne2_ne3 * ggml_type_size(dst->type));
1867
+ void* tmp_arange_buffer = arange_allocator.get();
1868
+
1869
+ // arange1: [1, ..., n_heads_log2_floor+1)
1870
+ float start = 1;
1871
+ float stop = n_heads_log2_floor + 1;
1872
+ float step = 1;
1873
+ int64_t n_elements_arange = n_heads_log2_floor;
1874
+
1875
+ int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
1876
+ size_t tmp_arange1_nb[] = {sizeof(dst->type)};
1877
+ aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
1878
+ tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
1879
+ ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
1880
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1881
+
1882
+ aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
1883
+
1884
+ aclTensor* tmp_arange2_tensor = nullptr;
1885
+ if (n_heads_log2_floor < ne2_ne3) {
1886
+ // arange2: [1, 3, ..., 2 * (ne2_ne3 - n_heads_log2_floor) + 1), step 2
1887
+ start = 1;
1888
+ stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
1889
+ step = 2;
1890
+ n_elements_arange = ne2_ne3 - n_heads_log2_floor;
1891
+ int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
1892
+ size_t tmp_arange2_nb[] = {sizeof(dst->type)};
1893
+
1894
+ tmp_arange2_tensor = ggml_cann_create_tensor(
1895
+ (char*)tmp_arange_buffer +
1896
+ n_heads_log2_floor * ggml_type_size(dst->type),
1897
+ ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1898
+ tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1899
+ aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
1900
+ n_elements_arange);
1901
+ }
1902
+
1903
+ // init mk_base
1904
+ ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
1905
+ ne2_ne3 * ggml_type_size(dst->type));
1906
+ void* tmp_mk_base_buffer = mk_base_allocator.get();
1907
+ int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
1908
+ size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
1909
+ aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
1910
+ tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
1911
+ ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
1912
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1913
+
1914
+ aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
1915
+
1916
+ aclTensor* tmp_mk_base2_tensor = nullptr;
1917
+ if (n_heads_log2_floor < ne2_ne3) {
1918
+ int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
1919
+ size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
1920
+ tmp_mk_base2_tensor = ggml_cann_create_tensor(
1921
+ (char*)tmp_mk_base_buffer +
1922
+ n_heads_log2_floor * ggml_type_size(dst->type),
1923
+ ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1924
+ tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1925
+ aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
1926
+ }
1927
+
1928
+ // init mk
1929
+ int64_t tmp_mk_base_ne[] = {ne2_ne3};
1930
+ size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
1931
+ aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
1932
+ tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
1933
+ ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
1934
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1935
+ aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
1936
+ tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
1937
+ ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
1938
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1939
+ aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
1940
+
1941
+ // reshape mk
1942
+ int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
1943
+ size_t tmp_mk_nb[GGML_MAX_DIMS];
1944
+ tmp_mk_nb[0] = ggml_type_size(dst->type);
1945
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1946
+ tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
1947
+ }
1948
+ aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
1949
+ tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
1950
+ ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
1951
+ ACL_FORMAT_ND);
1952
+
1953
+ // acl_position * mk
1954
+ int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
1955
+ size_t tmp_output_nb[GGML_MAX_DIMS];
1956
+ tmp_output_nb[0] = ggml_type_size(dst->type);
1957
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1958
+ tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
1959
+ }
1960
+ ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
1961
+ void* tmp_output_buffer = output_allocator.get();
1962
+ aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
1963
+ tmp_output_buffer, ggml_cann_type_mapping(dst->type),
1964
+ ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
1965
+ ACL_FORMAT_ND);
1966
+ aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);
1967
+
1968
+ // add
1969
+ aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
1970
+
1971
+ ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor));
1972
+ ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor));
1973
+ ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor));
1974
+ ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor));
1975
+ ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor));
1976
+ ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
1977
+ ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
1978
+ ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
1979
+ }
1980
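The two arange segments, the m0/m1 fills and the tensor-tensor pow above materialise the standard ALiBi slope for each head on the device. The same value computed directly on the host for a single head (a standalone sketch, not part of the patch):

#include <cmath>
#include <cstdint>

// Standalone reference: ALiBi slope of head `head` out of `n_head` heads.
static float alibi_slope(uint32_t head, uint32_t n_head, float max_bias) {
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -max_bias / (float) n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / (float) n_head_log2);
    if (head < n_head_log2) {
        return powf(m0, (float) (head + 1));                      // m0^1 .. m0^n_head_log2
    }
    return powf(m1, (float) (2 * (head - n_head_log2) + 1));      // odd powers of m1
}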
+
1981
+ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1982
+ ggml_cann_dup(ctx, dst);
1983
+ }
1984
+
1985
+ /**
1986
+ * @brief Performs element-wise addition of two tensors in place.
1987
+ *
1988
+ * This function adds the source tensor `acl_src` to the destination tensor
1989
+ * `acl_dst` element-wise and stores the result in the destination tensor
1990
+ * `acl_dst`.
1991
+ *
1992
+ * @param ctx The context for the CANN backend operations.
1993
+ * @param acl_src The source tensor to be added.
1994
+ * @param acl_dst The destination tensor which will hold the result of the
1995
+ * addition.
1996
+ */
1997
+ static void aclnn_inplace_add(ggml_backend_cann_context& ctx,
1998
+ aclTensor* acl_src, aclTensor* acl_dst) {
1999
+ aclScalar* alpha = nullptr;
2000
+ float alphaValue = 1.0f;
2001
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2002
+
2003
+ uint64_t workspaceSize = 0;
2004
+ aclOpExecutor* executor;
2005
+ void* workspaceAddr = nullptr;
2006
+
2007
+ ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha,
2008
+ &workspaceSize, &executor));
2009
+ if (workspaceSize > 0) {
2010
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2011
+ workspaceAddr = workspace_allocator.get();
2012
+ }
2013
+
2014
+ ACL_CHECK(
2015
+ aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
2016
+
2017
+ ACL_CHECK(aclDestroyScalar(alpha));
2018
+ }
2019
+
2020
+ /**
2021
+ * @brief Applies the softmax function to a tensor along a specified dimension.
2022
+ *
2023
+ * This function computes the softmax of the source tensor `acl_src` along the
2024
+ * specified dimension `dim` and stores the result in the destination tensor
2025
+ * `acl_dst`.
2026
+ *
2027
+ * @param ctx The context for the CANN backend operations.
2028
+ * @param acl_src The source tensor on which the softmax function will be
2029
+ * applied.
2030
+ * @param dim The dimension along which the softmax function will be computed.
2031
+ * @param acl_dst The destination tensor where the softmax results will be
2032
+ * stored.
2033
+ */
2034
+ static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2035
+ int64_t dim, aclTensor* acl_dst) {
2036
+ uint64_t workspaceSize = 0;
2037
+ aclOpExecutor* executor;
2038
+ void* workspaceAddr = nullptr;
2039
+
2040
+ ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(acl_src, dim, acl_dst,
2041
+ &workspaceSize, &executor));
2042
+
2043
+ if (workspaceSize > 0) {
2044
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2045
+ workspaceAddr = workspace_allocator.get();
2046
+ }
2047
+
2048
+ aclrtStream stream = ctx.stream();
2049
+ ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream));
2050
+ }
2051
+
2052
+ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2053
+ ggml_tensor* src0 = dst->src[0];
2054
+ ggml_tensor* src1 = dst->src[1]; // mask
2055
+
2056
+ aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
2057
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2058
+
2059
+ float scale = 1.0f;
2060
+ float max_bias = 0.0f;
2061
+
2062
+ memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
2063
+ memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
2064
+
2065
+ // input mul scale
2066
+ aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
2067
+
2068
+ size_t n_bytes = ggml_nbytes(src0);
2069
+ ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
2070
+ void* input_mul_scale_buffer = mul_scale_allocator.get();
2071
+ aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
2072
+ input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
2073
+ src0->nb, GGML_MAX_DIMS);
2074
+
2075
+ bool inplace = false;
2076
+ aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);
2077
+
2078
+ // mask
2079
+ aclTensor* acl_src1_fp32_tensor = nullptr;
2080
+ aclTensor* tmp_mask_tensor = nullptr;
2081
+ ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
2082
+ if (src1) {
2083
+ const bool use_f16 = src1->type == GGML_TYPE_F16;
2084
+ if (use_f16) {
2085
+ // cast to fp32
2086
+ size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
2087
+ size_t src1_fp32_nb[GGML_MAX_DIMS];
2088
+ src1_fp32_nb[0] = sizeof(float_t);
2089
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2090
+ src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
2091
+ }
2092
+ src1_fp32_allocator.alloc(n_bytes);
2093
+ void* src1_fp32_buffer = src1_fp32_allocator.get();
2094
+ acl_src1_fp32_tensor = ggml_cann_create_tensor(
2095
+ src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
2096
+ src1_fp32_nb, GGML_MAX_DIMS);
2097
+ aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
2098
+ aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
2099
+
2100
+ ACL_CHECK(aclDestroyTensor(acl_src1));
2101
+ } else {
2102
+ acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
2103
+ }
2104
+
2105
+ // broadcast the mask across rows: only ne11 rows of the mask are used, not ne01
2106
+ if (src1->ne[1] != src0->ne[1]) {
2107
+ // mask shape: [1,1,ne11,ne10]
2108
+ int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
2109
+ size_t tmp_mask_nb[GGML_MAX_DIMS];
2110
+ tmp_mask_nb[0] = sizeof(float_t);
2111
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2112
+ tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
2113
+ }
2114
+ tmp_mask_tensor = ggml_cann_create_tensor(
2115
+ src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
2116
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
2117
+ }
2118
+
2119
+ // alibi
2120
+ const int n_head = src0->ne[2];
2121
+ const size_t src_nb0 = src0->nb[0];
2122
+
2123
+ n_bytes = ggml_nbytes(dst);
2124
+ ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
2125
+ void* output_buffer = output_allocator.get();
2126
+ aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
2127
+ output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
2128
+ dst->nb, GGML_MAX_DIMS);
2129
+ if (max_bias <= 0.0f) {
2130
+ // slope = 1.0
2131
+ if (tmp_mask_tensor) {
2132
+ aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
2133
+ alibi_output_tensor);
2134
+ } else {
2135
+ aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
2136
+ alibi_output_tensor);
2137
+ }
2138
+ } else {
2139
+ // slope != 1.0
2140
+ if (tmp_mask_tensor) {
2141
+ aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
2142
+ alibi_output_tensor, n_head, src0->ne, src_nb0,
2143
+ max_bias, dst);
2144
+ } else {
2145
+ aclnn_alibi(ctx, acl_input_mul_scale_tensor,
2146
+ acl_src1_fp32_tensor, alibi_output_tensor, n_head,
2147
+ src0->ne, src_nb0, max_bias, dst);
2148
+ }
2149
+ }
2150
+
2151
+ // softmax
2152
+ aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
2153
+ ACL_CHECK(aclDestroyTensor(alibi_output_tensor));
2154
+ } else {
2155
+ aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
2156
+ }
2157
+
2158
+ ACL_CHECK(aclDestroyTensor(acl_src0));
2159
+ ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor));
2160
+ ACL_CHECK(aclDestroyTensor(acl_dst));
2161
+ ACL_CHECK(aclDestroyScalar(acl_scale));
2162
+ ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor));
2163
+ ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
2164
+ }
2165
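Putting the pieces together, the graph built above computes, over the innermost dimension of each row, softmax(scale * src0 + slope * mask), where the slope comes from the ALiBi path when max_bias > 0 and is effectively 1.0 otherwise. A numerically stable CPU reference of that per-row result (illustrative only):

#include <cmath>
#include <cstdint>

// Reference only: y = softmax(scale * x + slope * mask) over one row of length n.
static void softmax_row_ref(const float* x, const float* mask, float scale,
                            float slope, float* y, int64_t n) {
    float maxv = -INFINITY;
    for (int64_t i = 0; i < n; i++) {
        y[i] = scale * x[i] + (mask ? slope * mask[i] : 0.0f);
        if (y[i] > maxv) maxv = y[i];
    }
    float sum = 0.0f;
    for (int64_t i = 0; i < n; i++) {
        y[i] = expf(y[i] - maxv);   // subtract the row max for stability
        sum += y[i];
    }
    for (int64_t i = 0; i < n; i++) {
        y[i] /= sum;
    }
}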
+
2166
+ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2167
+ ggml_tensor* src0 = dst->src[0];
2168
+ ggml_tensor* src1 = dst->src[1];
2169
+
2170
+ ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
2171
+ ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
2172
+ ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
2173
+ src0->extra = src0_extra_allocator.get();
2174
+ src1->extra = src1_extra_allocator.get();
2175
+ dst->extra = dst_extra_allocator.get();
2176
+ ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
2177
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
2178
+ ctx.stream()));
2179
+ ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
2180
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
2181
+ ctx.stream()));
2182
+ ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
2183
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
2184
+ ctx.stream()));
2185
+
2186
+ switch (src0->type) {
2187
+ case GGML_TYPE_F32:
2188
+ aclrtlaunch_ascendc_get_row_f32(
2189
+ 24, ctx.stream(), src0->data, src1->data, dst->data,
2190
+ ((ggml_tensor*)src0->extra)->ne,
2191
+ ((ggml_tensor*)src0->extra)->nb,
2192
+ ((ggml_tensor*)src1->extra)->ne,
2193
+ ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2194
+ ((ggml_tensor*)dst->extra)->nb);
2195
+ break;
2196
+ case GGML_TYPE_F16:
2197
+ aclrtlaunch_ascendc_get_row_f16(
2198
+ 24, ctx.stream(), src0->data, src1->data, dst->data,
2199
+ ((ggml_tensor*)src0->extra)->ne,
2200
+ ((ggml_tensor*)src0->extra)->nb,
2201
+ ((ggml_tensor*)src1->extra)->ne,
2202
+ ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2203
+ ((ggml_tensor*)dst->extra)->nb);
2204
+ break;
2205
+ case GGML_TYPE_Q4_0:
2206
+ aclrtlaunch_ascendc_get_row_q4_0(
2207
+ 24, ctx.stream(), src0->data, src1->data, dst->data,
2208
+ ((ggml_tensor*)src0->extra)->ne,
2209
+ ((ggml_tensor*)src1->extra)->ne,
2210
+ ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2211
+ ((ggml_tensor*)dst->extra)->nb);
2212
+ break;
2213
+ case GGML_TYPE_Q8_0:
2214
+ aclrtlaunch_ascendc_get_row_q8_0(
2215
+ 24, ctx.stream(), src0->data, src1->data, dst->data,
2216
+ ((ggml_tensor*)src0->extra)->ne,
2217
+ ((ggml_tensor*)src1->extra)->ne,
2218
+ ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2219
+ ((ggml_tensor*)dst->extra)->nb);
2220
+ break;
2221
+ default:
2222
+ GGML_ABORT("fatal error");
2223
+ break;
2224
+ }
2225
+ }
2226
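All four branches above launch a gather kernel that, for every index in src1, copies the corresponding row of src0 into dst (dequantizing on the fly for the quantized types). The f32 case is equivalent to this host-side sketch (an assumption for illustration):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Reference only: gather rows of a contiguous f32 matrix by int32 indices.
static void get_rows_f32_ref(const float* src0, int64_t row_size,
                             const int32_t* rows, int64_t n_rows, float* dst) {
    for (int64_t r = 0; r < n_rows; r++) {
        std::memcpy(dst + r * row_size, src0 + (int64_t) rows[r] * row_size,
                    (size_t) row_size * sizeof(float));
    }
}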
+
2227
+ /**
2228
+ * @brief Repeats elements of a tensor along a specified dimension.
2229
+ *
2230
+ * This function repeats each element of the source tensor `acl_src` a specified
2231
+ * number of times (`repeats`) along the specified dimension `dim` and stores
2232
+ * the result in the destination tensor `acl_dst`.
2233
+ *
2234
+ * @param ctx The context for the CANN backend operations.
2235
+ * @param acl_src The source tensor whose elements will be repeated.
2236
+ * @param acl_dst The destination tensor where the repeated elements will be
2237
+ * stored.
2238
+ * @param dim The dimension along which the elements will be repeated.
2239
+ * @param repeats The number of times each element will be repeated.
2240
+ * @param output_size The size of the output tensor.
2241
+ */
2242
+ static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
2243
+ aclTensor* acl_src, aclTensor* acl_dst,
2244
+ int64_t dim, int64_t repeats,
2245
+ int64_t output_size) {
2246
+ uint64_t workspaceSize = 0;
2247
+ aclOpExecutor* executor;
2248
+ void* workspaceAddr = nullptr;
2249
+
2250
+ ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize(
2251
+ acl_src, repeats, dim, output_size, acl_dst, &workspaceSize,
2252
+ &executor));
2253
+ if (workspaceSize > 0) {
2254
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2255
+ workspaceAddr = workspace_allocator.get();
2256
+ }
2257
+
2258
+ ACL_CHECK(aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize,
2259
+ executor, ctx.stream()));
2260
+ }
2261
+
2262
+ /**
2263
+ * @brief Performs matrix multiplication of two tensors.
2264
+ *
2265
+ * This function computes the matrix multiplication of the input tensor
2266
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
2267
+ * destination tensor `acl_dst`.
2268
+ * The operation is defined as:
2269
+ * \f[
2270
+ * \text {acl_dst}=\text {acl_input@acl_weight}
2271
+ * \f]
2272
+ *
2273
+ * @param ctx The context for the CANN backend operations.
2274
+ * @param acl_input The input tensor for the matrix multiplication.
2275
+ * @param acl_weight The weight tensor for the matrix multiplication.
2276
+ * @param acl_dst The destination tensor where the result of the matrix
2277
+ * multiplication will be stored.
2278
+ */
2279
+ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
2280
+ aclTensor* acl_weight, aclTensor* acl_dst) {
2281
+ int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION: when the input is
2282
+ // fp32, Atlas A2 will convert it to HFLOAT32.
2283
+
2284
+ uint64_t workspaceSize = 0;
2285
+ aclOpExecutor* executor;
2286
+ void* workspaceAddr = nullptr;
2287
+
2288
+ ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
2289
+ cube_math_type, &workspaceSize,
2290
+ &executor));
2291
+
2292
+ if (workspaceSize > 0) {
2293
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2294
+ workspaceAddr = workspace_allocator.get();
2295
+ }
2296
+
2297
+ ACL_CHECK(
2298
+ aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
2299
+ }
2300
+
2301
+ /**
2302
+ * @brief Performs matrix multiplication with floating-point precision on
2303
+ * tensors using the CANN backend.
2304
+ *
2305
+ * This function performs matrix multiplication of the input tensor and the
2306
+ * weight tensor, handling broadcasting and transposing as needed, and stores
2307
+ * the result in the destination tensor `dst`.
2308
+ *
2309
+ * @param ctx The context for the CANN backend operations.
2310
+ * @param dst The destination tensor where the result of the matrix
2311
+ * multiplication will be stored.
2312
+ */
2313
+ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
2314
+ ggml_tensor* dst) {
2315
+ ggml_tensor* weight = dst->src[0]; // weight
2316
+ ggml_tensor* input = dst->src[1]; // input
2317
+
2318
+ // When the weight's ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will
2319
+ // broadcast automatically; when ne2 or ne3 is not 1, the weight needs to be repeated.
2320
+ BCAST_MUL_MAT_SHAPE(input, weight, dst);
2321
+
2322
+ // transpose weight: [1,2,3,4] -> [1,2,4,3]
2323
+ int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
2324
+ bcast_weight_ne[2], bcast_weight_ne[3],
2325
+ bcast_weight_ne[4], bcast_weight_ne[5]};
2326
+ size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
2327
+ bcast_weight_nb[2], bcast_weight_nb[3],
2328
+ bcast_weight_nb[4], bcast_weight_nb[5]};
2329
+
2330
+ aclTensor* acl_weight_tensor =
2331
+ ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
2332
+ aclTensor* acl_input_tensor =
2333
+ ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
2334
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
2335
+ aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
2336
+
2337
+ ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
2338
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2339
+ ACL_CHECK(aclDestroyTensor(acl_dst));
2340
+ }
2341
+
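Editor's note: the transpose above never moves data; it only swaps the first two ne/nb entries so that aclnnMatmul sees the weight with its rows and columns exchanged. A self-contained sketch of such a zero-copy transposed view (the 4096 x 11008 shape is purely illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // ggml-style description of a 2-D f32 weight: ne = sizes, nb = strides in bytes.
        int64_t ne[2] = {4096, 11008};
        size_t  nb[2] = {sizeof(float), sizeof(float) * 4096};

        // Same trick as ggml_cann_mat_mul_fp: exchange dims 0 and 1 of ne/nb to
        // describe the transposed matrix over the very same buffer.
        int64_t transpose_ne[2] = {ne[1], ne[0]};
        size_t  transpose_nb[2] = {nb[1], nb[0]};

        std::printf("transposed view: %lld x %lld, strides %zu and %zu bytes\n",
                    (long long)transpose_ne[0], (long long)transpose_ne[1],
                    transpose_nb[0], transpose_nb[1]);
        return 0;
    }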
2342
+ /**
2343
+ * @brief Performs matrix multiplication with quantized weights and
2344
+ * floating-point inputs using the CANN backend.
2345
+ *
2346
+ * This function performs matrix multiplication of the input tensor `src1` and
2347
+ * the weight tensor `src0`, handling broadcasting, transposing, and
2348
+ * quantization as needed, and stores the result in the destination tensor
2349
+ * `dst`.
2350
+ *
2351
+ * @param ctx The context for the CANN backend operations.
2352
+ * @param dst The destination tensor where the result of the matrix
2353
+ * multiplication will be stored.
2354
+ */
2355
+ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
2356
+ ggml_tensor* dst) {
2357
+ ggml_tensor* src0 = dst->src[0]; // weight
2358
+ ggml_tensor* src1 = dst->src[1]; // input
2359
+
2360
+ // The shape of the weight is NCHW. Matrix multiplication uses the HW dims; NC
2361
+ // is regarded as the batch. The weight needs to be transposed.
2362
+ int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
2363
+ size_t weight_elem_size = sizeof(uint8_t);
2364
+ size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
2365
+ // size of one matrix is element_size * height * width.
2366
+ size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
2367
+ size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
2368
+
2369
+ // The scales are stored at the end of the weight data and also need to be transposed.
2370
+ int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
2371
+ size_t scale_elem_size = sizeof(uint16_t);
2372
+ size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
2373
+ scale_elem_size};
2374
+ size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
2375
+ char* scale_offset = (char*)src0->data + weight_size;
2376
+
2377
+ // input
2378
+ void* input_buffer;
2379
+ size_t input_elem_size = sizeof(uint16_t);
2380
+ int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
2381
+ size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
2382
+ size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
2383
+
2384
+ if (src1->type != GGML_TYPE_F16) {
2385
+ aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
2386
+ ggml_cann_pool_alloc input_alloctor(
2387
+ ctx.pool(), ggml_nelements(src1) * input_elem_size);
2388
+ input_buffer = input_alloctor.get();
2389
+
2390
+ int64_t* input_cast_ne = src1->ne;
2391
+ size_t input_cast_nb[GGML_MAX_DIMS];
2392
+ input_cast_nb[0] = sizeof(uint16_t);
2393
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2394
+ input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
2395
+ }
2396
+
2397
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(
2398
+ input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
2399
+ input_cast_nb, GGML_MAX_DIMS);
2400
+ aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
2401
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2402
+ ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
2403
+ } else {
2404
+ input_buffer = src1->data;
2405
+ }
2406
+
2407
+ // output
2408
+ size_t output_elem_size = sizeof(uint16_t);
2409
+ int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
2410
+ size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
2411
+ ggml_cann_pool_alloc output_alloctor(
2412
+ ctx.pool(), ggml_nelements(dst) * output_elem_size);
2413
+ void* output_buffer = output_alloctor.get();
2414
+ size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
2415
+
2416
+ // aclnn
2417
+ uint64_t workspaceSize = 0;
2418
+ aclOpExecutor* executor;
2419
+ void* workspaceAddr = nullptr;
2420
+
2421
+ for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
2422
+ for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
2423
+ int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
2424
+ int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
2425
+
2426
+ int64_t batch1 = n1 * src1->ne[2] + c1;
2427
+ int64_t batch0 = n0 * src0->ne[2] + c0;
2428
+
2429
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(
2430
+ (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
2431
+ input_elem_size, input_ne, input_nb, 2);
2432
+ aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
2433
+ (char*)src0->data + batch0 * weight_stride, ACL_INT8,
2434
+ weight_elem_size, weight_ne, weight_nb, 2);
2435
+ aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
2436
+ scale_offset + batch0 * scale_stride, ACL_FLOAT16,
2437
+ scale_elem_size, scale_ne, scale_nb, 2);
2438
+ aclTensor* acl_output_tensor = ggml_cann_create_tensor(
2439
+ (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
2440
+ output_elem_size, output_ne, output_nb, 2);
2441
+
2442
+ ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
2443
+ acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
2444
+ nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
2445
+ &workspaceSize, &executor));
2446
+
2447
+ if (workspaceSize > 0 && workspaceAddr == nullptr) {
2448
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
2449
+ workspaceSize);
2450
+ workspaceAddr = workspace_allocator.get();
2451
+ }
2452
+
2453
+ ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
2454
+ workspaceAddr, workspaceSize, executor, ctx.stream()));
2455
+
2456
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2457
+ ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
2458
+ ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
2459
+ ACL_CHECK(aclDestroyTensor(acl_output_tensor));
2460
+ }
2461
+ }
2462
+
2463
+ // cast out
2464
+ int64_t* output_cast_ne = dst->ne;
2465
+ size_t output_cast_nb[GGML_MAX_DIMS];
2466
+ output_cast_nb[0] = sizeof(uint16_t);
2467
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2468
+ output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
2469
+ }
2470
+
2471
+ aclTensor* acl_output_tensor =
2472
+ ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
2473
+ output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
2474
+ aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
2475
+ aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
2476
+
2477
+ ACL_CHECK(aclDestroyTensor(acl_output_tensor));
2478
+ ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
2479
+ }
2480
+
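Editor's note: this kernel assumes the Q8_0 weight buffer holds all int8 quants first and then one fp16 scale per block of QK8_0 values, which is why scale_offset is simply data + weight_size. A short offset calculation under that assumption (illustrative shape; QK8_0 is 32 in ggml):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t QK8_0 = 32;               // ggml Q8_0 block size
        const int64_t ne0 = 4096, ne1 = 11008;  // hypothetical 2-D weight shape
        const size_t  weight_size = (size_t)(ne0 * ne1) * sizeof(int8_t);
        const size_t  scale_size  = (size_t)(ne0 / QK8_0 * ne1) * sizeof(uint16_t);

        std::printf("quants: %zu bytes, scales start at byte %zu and take %zu bytes\n",
                    weight_size, weight_size, scale_size);
        // Dequantizing one block: w[i] = scale * q[i] for i = 0 .. QK8_0 - 1.
        return 0;
    }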
2481
+ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2482
+ const enum ggml_type type = dst->src[0]->type;
2483
+ switch (type) {
2484
+ case GGML_TYPE_F32:
2485
+ case GGML_TYPE_F16:
2486
+ ggml_cann_mat_mul_fp(ctx, dst);
2487
+ break;
2488
+ // case GGML_TYPE_Q4_0:
2489
+ // ggml_cann_mul_mat_q4_0(ctx, dst);
2490
+ // break;
2491
+ case GGML_TYPE_Q8_0:
2492
+ ggml_cann_mul_mat_q8_0(ctx, dst);
2493
+ break;
2494
+ default:
2495
+ GGML_ABORT("fatal error");
2496
+ break;
2497
+ }
2498
+ }
2499
+
2500
+ /**
2501
+ * @brief Rolls the elements of a tensor along a specified dimension.
2502
+ *
2503
+ * This function rolls the elements of the source tensor `acl_src` by the
2504
+ * specified shifts `shifts` along the specified dimensions `dims`, and stores
2505
+ * the result in the destination tensor `acl_dst`.
2506
+ *
2507
+ * @param ctx The context for the CANN backend operations.
2508
+ * @param acl_src The source tensor whose elements will be rolled.
2509
+ * @param acl_dst The destination tensor where the rolled elements will be
2510
+ * stored.
2511
+ * @param shifts An array specifying the number of positions by which elements
2512
+ * are shifted.
2513
+ * @param dims An array specifying the dimensions along which elements are
2514
+ * shifted.
2515
+ */
2516
+ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2517
+ aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
2518
+ aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
2519
+ aclIntArray* acl_dims = aclCreateIntArray(dims, 1);
2520
+
2521
+ uint64_t workspaceSize = 0;
2522
+ aclOpExecutor* executor;
2523
+ void* workspaceAddr = nullptr;
2524
+
2525
+ ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst,
2526
+ &workspaceSize, &executor));
2527
+ if (workspaceSize > 0) {
2528
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2529
+ workspaceAddr = workspace_allocator.get();
2530
+ }
2531
+
2532
+ ACL_CHECK(aclnnRoll(workspaceAddr, workspaceSize, executor, ctx.stream()));
2533
+
2534
+ ACL_CHECK(aclDestroyIntArray(acl_shifts));
2535
+ ACL_CHECK(aclDestroyIntArray(acl_dims));
2536
+ }
2537
+
2538
+ /**
2539
+ * @brief Fills specified positions of a tensor with a scalar value.
2540
+ *
2541
+ * This function fills the positions in the source tensor `acl_src` specified by
2542
+ * `index` along the dimension `dim` with the scalar value `value`.
2543
+ *
2544
+ * @param ctx The context for the CANN backend operations.
2545
+ * @param acl_src The source tensor where the positions will be filled.
2546
+ * @param dim The dimension along which the positions are specified.
2547
+ * @param index An array specifying the positions to be filled.
2548
+ * @param index_num The number of positions specified in the index array.
2549
+ * @param value The scalar value used to fill the specified positions.
2550
+ */
2551
+ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
2552
+ aclTensor* acl_src, int64_t dim,
2553
+ int64_t* index, int64_t index_num,
2554
+ float value) {
2555
+ aclIntArray* acl_index = aclCreateIntArray(index, index_num);
2556
+ aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
2557
+
2558
+ uint64_t workspaceSize = 0;
2559
+ aclOpExecutor* executor;
2560
+ void* workspaceAddr = nullptr;
2561
+
2562
+ ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize(
2563
+ acl_src, dim, acl_index, acl_value, &workspaceSize, &executor));
2564
+ if (workspaceSize > 0) {
2565
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2566
+ workspaceAddr = workspace_allocator.get();
2567
+ }
2568
+
2569
+ ACL_CHECK(aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize,
2570
+ executor, ctx.stream()));
2571
+
2572
+ ACL_CHECK(aclDestroyIntArray(acl_index));
2573
+ ACL_CHECK(aclDestroyScalar(acl_value));
2574
+ }
2575
+
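Editor's note: ggml_cann_rope below uses this in-place index fill to turn an all-ones vector into the alternating sign mask for the non-neox path (index[i] = i / 2 * 2, value = -1). The same effect on the CPU, as a sanity check of the index pattern:

    #include <cstdio>
    #include <vector>

    int main() {
        const int ne0 = 8;
        std::vector<float> mask(ne0, 1.0f);  // counterpart of aclnn_ones
        for (int i = 0; i < ne0; ++i) {
            mask[i / 2 * 2] = -1.0f;         // fill indices 0, 0, 2, 2, 4, 4, ... with -1
        }
        // mask is now [-1, 1, -1, 1, -1, 1, -1, 1]
        for (float v : mask) std::printf("%.0f ", v);
        std::printf("\n");
        return 0;
    }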
2576
+ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
2577
+ aclTensor* acl_cos_repeat_tensor,
2578
+ aclTensor* acl_sin_repeat_tensor,
2579
+ float theta_scale, bool is_neox) {
2580
+ // init the sin/cos cache; the cache uses a different repeat method depending on
2581
+ // the parameter is_neox
2582
+
2583
+ ggml_tensor* src0 = dst->src[0]; // input
2584
+ ggml_tensor* src1 = dst->src[1]; // position
2585
+
2586
+ // arange: [0, 1, ..., ne0/2 - 1]
2587
+ int64_t arange_length = src0->ne[0] / 2;
2588
+ ggml_cann_pool_alloc arange_allocator(ctx.pool(),
2589
+ arange_length * sizeof(float_t));
2590
+ void* arange_buffer = arange_allocator.get();
2591
+ int64_t arange_ne[] = {arange_length, 1, 1, 1};
2592
+ size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
2593
+ arange_length * sizeof(float_t)};
2594
+
2595
+ aclTensor* acl_arange_tensor =
2596
+ ggml_cann_create_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t),
2597
+ arange_ne, arange_nb, GGML_MAX_DIMS);
2598
+ float start = 0;
2599
+ float step = 1;
2600
+ float stop = src0->ne[0] / 2;
2601
+ float n_elements = src0->ne[0] / 2;
2602
+ aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements);
2603
+
2604
+ // power
2605
+ // aclnnPowScalarTensor(): @param self is a tensor but should be a scalar, so
2606
+ // use aclnn_pow_tensor_tensor() until that is fixed. aclScalar* acl_theta_scale =
2607
+ // aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
2608
+ // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor,
2609
+ // acl_power_tensor);
2610
+ ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
2611
+ arange_length * sizeof(float_t));
2612
+ void* theta_scale_buffer = theta_scale_allocator.get();
2613
+ aclTensor* acl_theta_scale_tensor = aclnn_ones(
2614
+ ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
2615
+ GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
2616
+ aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
2617
+
2618
+ // position
2619
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
2620
+ int64_t position_length = src1->ne[0];
2621
+ int64_t position_ne[] = {1, position_length, 1, 1};
2622
+ size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t),
2623
+ sizeof(int32_t) * position_length,
2624
+ sizeof(int32_t) * position_length};
2625
+ aclTensor* acl_position_tensor = ggml_cann_create_tensor(
2626
+ src1->data, ggml_cann_type_mapping(src1->type),
2627
+ ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);
2628
+
2629
+ // power * position
2630
+ int64_t theta_length = arange_length * position_length;
2631
+ ggml_cann_pool_alloc theta_allocator(ctx.pool(),
2632
+ theta_length * sizeof(float_t));
2633
+ void* theta_buffer = theta_allocator.get();
2634
+ int64_t theta_ne[] = {arange_length, position_length, 1, 1};
2635
+ size_t theta_nb[GGML_MAX_DIMS];
2636
+ theta_nb[0] = sizeof(float_t);
2637
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2638
+ theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
2639
+ }
2640
+ aclTensor* acl_theta_tensor =
2641
+ ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
2642
+ theta_ne, theta_nb, GGML_MAX_DIMS);
2643
+ aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
2644
+ acl_theta_tensor);
2645
+
2646
+ // permute: [0,1,2,3]->[0,2,1,3]
2647
+ int64_t permute_ne[] = {arange_length, 1, position_length, 1};
2648
+ size_t permute_nb[GGML_MAX_DIMS];
2649
+ permute_nb[0] = sizeof(float_t);
2650
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2651
+ permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1];
2652
+ }
2653
+ ggml_cann_pool_alloc permute_allocator(ctx.pool(),
2654
+ theta_length * sizeof(float_t));
2655
+ void* permute_buffer = permute_allocator.get();
2656
+ aclTensor* acl_permute_tensor = ggml_cann_create_tensor(
2657
+ permute_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
2658
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
2659
+ int64_t permute_dim[] = {0, 2, 1, 3};
2660
+ int64_t num_dims = 4;
2661
+ aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim,
2662
+ num_dims);
2663
+
2664
+ // sin/cos
2665
+ ggml_cann_pool_alloc sin_allocator(ctx.pool(),
2666
+ theta_length * sizeof(float_t));
2667
+ void* sin_buffer = sin_allocator.get();
2668
+ aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
2669
+ sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
2670
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
2671
+ aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor);
2672
+
2673
+ ggml_cann_pool_alloc cos_allocator(ctx.pool(),
2674
+ theta_length * sizeof(float_t));
2675
+ void* cos_buffer = cos_allocator.get();
2676
+ aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
2677
+ cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
2678
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
2679
+ aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
2680
+
2681
+ // repeat
2682
+ if (is_neox) {
2683
+ int64_t repeatsArray[] = {1, 1, 1, 2};
2684
+ aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray);
2685
+ aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray);
2686
+ } else {
2687
+ int64_t num_repeats = 2;
2688
+ int64_t dim = 3;
2689
+ int64_t output_size = arange_length * num_repeats;
2690
+ aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
2691
+ num_repeats, output_size);
2692
+ aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
2693
+ num_repeats, output_size);
2694
+ }
2695
+
2696
+ // release
2697
+ ACL_CHECK(aclDestroyTensor(acl_arange_tensor));
2698
+ ACL_CHECK(aclDestroyTensor(acl_theta_scale_tensor));
2699
+ ACL_CHECK(aclDestroyTensor(acl_position_tensor));
2700
+ ACL_CHECK(aclDestroyTensor(acl_theta_tensor));
2701
+ ACL_CHECK(aclDestroyTensor(acl_permute_tensor));
2702
+ ACL_CHECK(aclDestroyTensor(acl_sin_tensor));
2703
+ ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
2704
+ }
2705
+
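Editor's note: numerically, the cache built above is sin/cos of theta(p, i) = pos[p] * theta_scale^i for i in [0, ne0/2), after which each value is duplicated along the last dim (interleaved or block-wise depending on is_neox). A CPU sketch of those angles, with an illustrative head size and freq_base:

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   half        = 4;            // ne0 / 2 of a hypothetical head
        const float freq_base   = 10000.0f;
        const float theta_scale = std::pow(freq_base, -2.0f / (2.0f * half));

        for (int pos = 0; pos < 3; ++pos) {      // a few token positions
            for (int i = 0; i < half; ++i) {
                const float theta = pos * std::pow(theta_scale, (float)i);
                std::printf("pos=%d i=%d sin=%.4f cos=%.4f\n",
                            pos, i, std::sin(theta), std::cos(theta));
            }
        }
        return 0;
    }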
2706
+ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2707
+ // TODO: use ascendc
2708
+ // Only tested with the LLAMA model.
2709
+ ggml_tensor* src0 = dst->src[0]; // input
2710
+ ggml_tensor* src2 = dst->src[2]; // freq_factors
2711
+
2712
+ // TODO: with freq_factors
2713
+ GGML_ASSERT(src2 == NULL);
2714
+
2715
+ // param
2716
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
2717
+ // const int n_past = ((int32_t *) dst->op_params)[0];
2718
+ const int n_dims = ((int32_t*)dst->op_params)[1];
2719
+ const int mode = ((int32_t*)dst->op_params)[2];
2720
+ // const int n_ctx = ((int32_t *) dst->op_params)[3];
2721
+ const int n_ctx_orig = ((int32_t*)dst->op_params)[4];
2722
+
2723
+ GGML_TENSOR_UNARY_OP_LOCALS
2724
+
2725
+ memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float));
2726
+ memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float));
2727
+ memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float));
2728
+ memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float));
2729
+ memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
2730
+ memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
2731
+
2732
+ GGML_ASSERT(n_dims <= ne0);
2733
+ GGML_ASSERT(n_dims % 2 == 0);
2734
+
2735
+ // TODO: ext_factor != 0
2736
+ GGML_ASSERT(ext_factor == 0);
2737
+ // TODO: freq_scale != 1
2738
+ GGML_ASSERT(freq_scale == 1);
2739
+
2740
+ const float theta_scale = powf(freq_base, -2.0f / n_dims);
2741
+
2742
+ float corr_dims[2];
2743
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
2744
+ beta_slow, corr_dims);
2745
+
2746
+ const bool is_neox = mode & 2;
2747
+
2748
+ // init cos/sin cache
2749
+ ggml_cann_pool_alloc sin_allocator(
2750
+ ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
2751
+ ggml_cann_pool_alloc cos_allocator(
2752
+ ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
2753
+ void* sin_buffer = sin_allocator.get();
2754
+ void* cos_buffer = cos_allocator.get();
2755
+
2756
+ int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
2757
+ size_t sin_reshape_nb[GGML_MAX_DIMS];
2758
+ sin_reshape_nb[0] = sizeof(float_t);
2759
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2760
+ sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
2761
+ }
2762
+ aclTensor* acl_sin_reshape_tensor =
2763
+ ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t),
2764
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2765
+ aclTensor* acl_cos_reshape_tensor =
2766
+ ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
2767
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2768
+ aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
2769
+ theta_scale, is_neox);
2770
+
2771
+ // roll input
2772
+ void* input_roll_buffer;
2773
+ aclTensor* acl_minus_one_tensor;
2774
+ void* minus_one_scale_buffer = nullptr;
2775
+ ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
2776
+ ggml_cann_pool_alloc minus_one_scale_allocator(
2777
+ ctx.pool(), sizeof(float_t) * src0->ne[0]);
2778
+ if (!is_neox) {
2779
+ // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
2780
+ input_roll_buffer = roll_allocator.get();
2781
+ int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
2782
+ src0->ne[2], src0->ne[3]};
2783
+ size_t input_roll_nb[GGML_MAX_DIMS];
2784
+ input_roll_nb[0] = ggml_type_size(src0->type);
2785
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2786
+ input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
2787
+ }
2788
+ aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
2789
+ input_roll_buffer, ggml_cann_type_mapping(src0->type),
2790
+ ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
2791
+ GGML_MAX_DIMS);
2792
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(
2793
+ src0->data, ggml_cann_type_mapping(src0->type),
2794
+ ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
2795
+ GGML_MAX_DIMS);
2796
+
2797
+ int64_t shifts[] = {1};
2798
+ int64_t dims[] = {3};
2799
+ aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2800
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
2801
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2802
+
2803
+ // init [-1, 1, -1, 1, ...]
2804
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
2805
+
2806
+ int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
2807
+ size_t minus_one_nb[GGML_MAX_DIMS];
2808
+ minus_one_nb[0] = sizeof(float_t);
2809
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2810
+ minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
2811
+ }
2812
+ acl_minus_one_tensor = aclnn_ones(
2813
+ ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
2814
+ minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
2815
+ int64_t dim = 3;
2816
+ int64_t* index = new int64_t[src0->ne[0]];
2817
+ for (int i = 0; i < src0->ne[0]; i++) {
2818
+ index[i] = i / 2 * 2;
2819
+ }
2820
+ int64_t index_num = src0->ne[0];
2821
+ float value = -1;
2822
+ aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
2823
+ index_num, value);
2824
+ } else {
2825
+ // roll input: [q0,q1,q2,...] ->
2826
+ // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
2827
+ input_roll_buffer = roll_allocator.get();
2828
+ aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
2829
+ input_roll_buffer, ggml_cann_type_mapping(src0->type),
2830
+ ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
2831
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
2832
+
2833
+ int64_t shifts[] = {src0->ne[0] / 2};
2834
+ int64_t dims[] = {3};
2835
+ aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2836
+
2837
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
2838
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2839
+
2840
+ // init [-1, -1, ..., -1, 1, 1, ..., 1]
2841
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
2842
+
2843
+ int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
2844
+ size_t minus_one_nb[GGML_MAX_DIMS];
2845
+ minus_one_nb[0] = sizeof(float_t);
2846
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2847
+ minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
2848
+ }
2849
+ acl_minus_one_tensor = aclnn_ones(
2850
+ ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
2851
+ minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
2852
+ // -1 * first half
2853
+ int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
2854
+ size_t first_half_nb[GGML_MAX_DIMS];
2855
+ first_half_nb[0] = sizeof(float_t);
2856
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2857
+ first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
2858
+ }
2859
+ aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
2860
+ minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
2861
+ first_half_nb, GGML_MAX_DIMS);
2862
+ bool inplace = true;
2863
+ float scale = -1;
2864
+ aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
2865
+ ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
2866
+ }
2867
+
2868
+ // TODO: n_dims < ne0
2869
+ GGML_ASSERT(n_dims == src0->ne[0]);
2870
+
2871
+ // input * scale
2872
+ ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
2873
+ ggml_nbytes(src0));
2874
+ void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
2875
+ size_t input_nb[GGML_MAX_DIMS];
2876
+ input_nb[0] = ggml_type_size(src0->type);
2877
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2878
+ input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
2879
+ }
2880
+ aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
2881
+ input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
2882
+ ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
2883
+ aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
2884
+ input_roll_buffer, ggml_cann_type_mapping(src0->type),
2885
+ ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
2886
+
2887
+ aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
2888
+ acl_input_roll_mul_scale_tensor);
2889
+
2890
+ // output
2891
+ aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
2892
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2893
+ void* output_fp32_buffer;
2894
+ if (src0->type == GGML_TYPE_F32) {
2895
+ aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
2896
+ aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
2897
+ acl_sin_reshape_tensor);
2898
+ aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
2899
+ // TODO: ne0 != n_dims in mode2
2900
+ } else if (src0->type == GGML_TYPE_F16) {
2901
+ size_t input_fp32_nb[GGML_MAX_DIMS];
2902
+ input_fp32_nb[0] = sizeof(float_t);
2903
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2904
+ input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
2905
+ }
2906
+ ggml_cann_pool_alloc fp32_allocator1(
2907
+ ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
2908
+ void* input_fp32_buffer1 = fp32_allocator1.get();
2909
+ aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
2910
+ input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
2911
+ input_fp32_nb, GGML_MAX_DIMS);
2912
+ ggml_cann_pool_alloc fp32_allocator2(
2913
+ ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
2914
+ void* input_fp32_buffer2 = fp32_allocator2.get();
2915
+ aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
2916
+ input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
2917
+ input_fp32_nb, GGML_MAX_DIMS);
2918
+
2919
+ ggml_cann_pool_alloc fp32_allocator(
2920
+ ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
2921
+ output_fp32_buffer = fp32_allocator.get();
2922
+ aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
2923
+ output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
2924
+ input_fp32_nb, GGML_MAX_DIMS);
2925
+ aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
2926
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
2927
+ input_fp32_tensor2);
2928
+ aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
2929
+ output_fp32_tensor);
2930
+ aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
2931
+
2932
+ ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
2933
+ ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
2934
+ ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
2935
+ }
2936
+
2937
+ ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
2938
+ ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
2939
+ ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
2940
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
2941
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
2942
+ ACL_CHECK(aclDestroyTensor(acl_src0));
2943
+ ACL_CHECK(aclDestroyTensor(acl_dst));
2944
+ }
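Editor's note: taken together, the roll, the sign mask, and the final mul/add above compute the usual RoPE rotation dst = x * cos + rotate(x) * sin. For the non-neox (interleaved) path, a scalar reference of what that amounts to per adjacent pair (values below are illustrative only):

    #include <cmath>
    #include <cstdio>

    int main() {
        float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};

        // One angle per pair, repeated for both slots (as the interleaved cache does).
        float sin_v[4], cos_v[4];
        for (int i = 0; i < 4; i += 2) {
            const float theta = 0.7f * std::pow(0.5f, (float)(i / 2)); // stand-in for pos * theta_scale^(i/2)
            sin_v[i] = sin_v[i + 1] = std::sin(theta);
            cos_v[i] = cos_v[i + 1] = std::cos(theta);
        }

        // roll within each pair and flip the sign of the even slots, then combine:
        const float rolled[4] = {x[1], x[0], x[3], x[2]};
        const float mask[4]   = {-1.0f, 1.0f, -1.0f, 1.0f};
        float out[4];
        for (int i = 0; i < 4; ++i) {
            out[i] = x[i] * cos_v[i] + rolled[i] * mask[i] * sin_v[i];
        }
        for (int i = 0; i < 4; ++i) std::printf("%.4f ", out[i]);
        std::printf("\n");
        return 0;
    }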