@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
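
Most of the churn in ggml-cann/aclnn_ops.cpp (file 85, diffed below) follows one pattern: the hand-rolled aclnnXxxGetWorkspaceSize / workspace-pool allocation / aclnnXxx launch / aclDestroy* sequence around each CANN operator is collapsed into a GGML_CANN_CALL_ACLNN_OP macro call plus a single variadic ggml_cann_release_resources call. A minimal before/after sketch of that pattern, taken from the leaky-relu hunk below; the macro and the release helper are defined in the updated CANN common.h, which is not part of this excerpt, so their exact expansion is assumed rather than shown:

    // Before (0.3.16): each operator queried its workspace size, allocated it
    // from the context pool, launched the op on the stream, then destroyed
    // every ACL handle one by one.
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;
    ACL_CHECK(aclnnLeakyReluGetWorkspaceSize(
        acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }
    ACL_CHECK(aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream()));
    ACL_CHECK(aclDestroyScalar(acl_negative_slope));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));

    // After (0.3.17): the macro hides the workspace/executor boilerplate and
    // the handles are released in one call.
    GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src, acl_negative_slope, acl_dst);
    ggml_cann_release_resources(ctx, acl_negative_slope, acl_src, acl_dst);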
@@ -28,8 +28,8 @@
  #include <aclnnop/aclnn_cast.h>
  #include <aclnnop/aclnn_constant_pad_nd.h>
  #include <aclnnop/aclnn_copy.h>
- #include <aclnnop/aclnn_cos.h>
  #include <aclnnop/aclnn_div.h>
+ #include <aclnnop/aclnn_embedding.h>
  #include <aclnnop/aclnn_exp.h>
  #include <aclnnop/aclnn_fill_scalar.h>
  #include <aclnnop/aclnn_group_norm.h>
@@ -44,12 +44,27 @@
  #include <aclnnop/aclnn_repeat.h>
  #include <aclnnop/aclnn_repeat_interleave.h>
  #include <aclnnop/aclnn_roll.h>
- #include <aclnnop/aclnn_sin.h>
  #include <aclnnop/aclnn_softmax.h>
  #include <aclnnop/aclnn_tril.h>
  #include <aclnnop/aclnn_triu.h>
  #include <aclnnop/aclnn_upsample_nearest_2d.h>
  #include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
+ #include <aclnnop/aclnn_argmax.h>
+ #include <aclnnop/aclnn_sum.h>
+ #include <aclnnop/aclnn_rms_norm.h>
+ #include <aclnnop/aclnn_im2col.h>
+ #include <aclnnop/aclnn_add.h>
+ #include <aclnnop/aclnn_sub.h>
+ #include <aclnnop/aclnn_mul.h>
+ #include <aclnnop/aclnn_div.h>
+ #include <aclnnop/aclnn_convolution.h>
+ #include <aclnnop/aclnn_elu.h>
+ #include <aclnnop/aclnn_log.h>
+ #include <aclnnop/aclnn_mean.h>
+ #include <aclnnop/aclnn_reflection_pad1d.h>
+ #include <aclnnop/aclnn_eq_tensor.h>
+ #include <aclnnop/aclnn_gt_scalar.h>
+ #include <aclnnop/aclnn_pow.h>
  #include <float.h>

  #include <cmath>
@@ -58,12 +73,39 @@
  #include <vector>

  #include "ggml-impl.h"
- #include "kernels/ascendc_kernels.h"

  #define GGML_COMMON_DECL_C

  #include "../ggml-common.h"

+ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
+ aclTensor ** acl_src1, aclTensor ** acl_dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
+ // Need bcast
+ if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
+ BCAST_SHAPE(src0, src1)
+ *acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
+ *acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
+ *acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
+ } else {
+ *acl_src0 = ggml_cann_create_tensor(src0);
+ *acl_src1 = ggml_cann_create_tensor(src1);
+ *acl_dst = ggml_cann_create_tensor(dst);
+ }
+ }
+
+ void ggml_cann_unary_op(
+ std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+ ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+ ggml_tensor* src = dst->src[0];
+
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+ unary_op(ctx, acl_src, acl_dst);
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
+ }
+
  /**
  * @brief Repeats elements of a tensor along each dimension according to the
  * specified repeat array.
@@ -79,24 +121,26 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  // repeat tensor along each dim with repeat_array
  aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst,
- &workspaceSize, &executor));
+ GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats, acl_dst);
+ ggml_cann_release_resources(ctx, repeats);
+ }

- if (workspaceSize > 0) {
- // Memory from allocator will "free" immediately, and this memory
- // will be alloced to other pointers, but it won't access before
- // this async task end because all tasks in same stream will execute
- // in queue.
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
- ACL_CHECK(
- aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream()));
- ACL_CHECK(aclDestroyIntArray(repeats));
+ /**
+ * @brief Casts the data type of a source tensor to a destination tensor.
+ *
+ * This function casts the data type of the source tensor `acl_src` to the
+ * specified data type `cast_data_type` and stores the result in the destination
+ * tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose data type will be casted.
+ * @param acl_dst The destination tensor where the casted result will be stored.
+ * @param cast_data_type The target data type to which the source tensor will be
+ * casted.
+ */
+ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+ aclTensor* acl_dst, aclDataType cast_data_type) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src, cast_data_type, acl_dst);
  }

  void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -110,73 +154,78 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};

  aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
  }

- /**
- * @brief Adds two tensors element-wise and stores the result in a destination
- * tensor.
- *
- * This function performs the operation:
- * \f[
- * dst = acl\_src0 + alpha \times acl\_src1
- * \f]
- * where alpha is a scalar value and defaults to 1.0f.
- *
- * @param ctx The context for the CANN backend operations.
- * @param acl_src0 The first source tensor.
- * @param acl_src1 The second source tensor.
- * @param acl_dst The destination tensor where the result will be stored.
- */
- static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
+ void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
  aclTensor* acl_src1, aclTensor* acl_dst) {
- aclScalar* alpha = nullptr;
  float alphaValue = 1.0f;
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
-
- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
- &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
+ aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+ if (acl_dst != nullptr)
+ GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
+ else
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha);
+ ggml_cann_release_resources(ctx, alpha);
+ }

- ACL_CHECK(aclDestroyScalar(alpha));
+ void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
+ aclTensor* acl_src1, aclTensor* acl_dst) {
+ float alphaValue = 1.0f;
+ aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+ if (acl_dst != nullptr)
+ GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha, acl_dst);
+ else
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha);
+ ggml_cann_release_resources(ctx, alpha);
  }

- void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src0 = dst->src[0];
- ggml_tensor* src1 = dst->src[1];
- GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+ void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+ aclTensor* acl_other, aclTensor* acl_dst) {
+ if (acl_dst != nullptr)
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_other, acl_dst);
+ else
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_src, acl_other);
+ }

- aclTensor* acl_src0;
- aclTensor* acl_src1;
- aclTensor* acl_dst;
+ void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+ aclTensor* acl_other, aclTensor* acl_dst) {
+ if (acl_dst != nullptr)
+ GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_other, acl_dst);
+ else
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDiv, acl_src, acl_other);
+ }

- // Need bcast
- if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
- BCAST_SHAPE(src0, src1)
- acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
- acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
- acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
+ /**
+ * @brief Multiplies elements of a tensor by a scalar value, optionally
+ * in-place.
+ *
+ * This function multiplies each element of the source tensor `acl_src` by the
+ * scalar `scale` and stores the result in the destination tensor `acl_dst`. If
+ * `inplace` is true, `acl_dst` will not be used and the operation is performed
+ * in-place on `acl_src`.
+ * The operation is defined as:
+ * \f[
+ * \text {acl_dst }_i=\text {acl_src }_i \times \text {scale}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be multiplied.
+ * @param scale The scalar value by which each element of `acl_src` will be
+ * multiplied.
+ * @param acl_dst The destination tensor where the result will be stored if
+ * `inplace` is false.
+ * @param inplace Flag indicating whether to perform the operation in-place on
+ * `acl_src`.
+ */
+ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+ float scale, aclTensor* acl_dst, bool inplace) {
+ aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+ if (inplace) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale);
  } else {
- acl_src0 = ggml_cann_create_tensor(src0);
- acl_src1 = ggml_cann_create_tensor(src1);
- acl_dst = ggml_cann_create_tensor(dst);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, acl_scale, acl_dst);
  }
-
- aclnn_add(ctx, acl_src0, acl_src1, acl_dst);
-
- ACL_CHECK(aclDestroyTensor(acl_src0));
- ACL_CHECK(aclDestroyTensor(acl_src1));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ ggml_cann_release_resources(ctx, acl_scale);
  }

  void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -193,23 +242,8 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  aclScalar* acl_negative_slope =
  aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnLeakyReluGetWorkspaceSize(
- acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(
- aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyScalar(acl_negative_slope));
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src, acl_negative_slope, acl_dst);
+ ggml_cann_release_resources(ctx, acl_negative_slope, acl_src, acl_dst);
  }

  /**
@@ -225,18 +259,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  static void aclnn_concat(ggml_backend_cann_context& ctx,
  aclTensorList* tensorList, aclTensor* acl_dst,
  int64_t concat_dim) {
- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst,
- &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, ctx.stream()));
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cat, tensorList, concat_dim, acl_dst);
  }

  void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -252,11 +275,10 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  int32_t acl_dim = 3 - dim;

  aclTensor* tensors[] = {acl_src0, acl_src1};
- aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
- aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
+ aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
+ aclnn_concat(ctx, tensor_list, acl_dst, acl_dim);

- ACL_CHECK(aclDestroyTensorList(tensorList));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ ggml_cann_release_resources(ctx, tensor_list, acl_dst);
  }

  /**
@@ -282,27 +304,12 @@ static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
  int64_t steps = (int64_t)std::ceil((stop - start) / step);
  GGML_ASSERT(n_elements == steps);

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
  aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
  aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
  aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);

- ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst,
- &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(
- aclnnArange(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyScalar(acl_start));
- ACL_CHECK(aclDestroyScalar(acl_end));
- ACL_CHECK(aclDestroyScalar(acl_step));
+ GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start, acl_end, acl_step, acl_dst);
+ ggml_cann_release_resources(ctx, acl_start, acl_end, acl_step);
  }

  void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -319,18 +326,11 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  memcpy(&step, (float*)dst->op_params + 2, sizeof(float));

  aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
- ACL_CHECK(aclDestroyTensor(acl_dst));
- }
-
- void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- dst->src[1] = dst->src[0];
- ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
+ ggml_cann_release_resources(ctx, acl_dst);
  }

  void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  ggml_tensor* src = dst->src[0];
- GGML_ASSERT(src->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);

  float min;
  float max;
@@ -343,23 +343,8 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
  aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst,
- &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyScalar(acl_min));
- ACL_CHECK(aclDestroyScalar(acl_max));
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src, acl_min, acl_max, acl_dst);
+ ggml_cann_release_resources(ctx, acl_min, acl_max, acl_src, acl_dst);
  }

  void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -373,22 +358,8 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  aclTensor* acl_src = ggml_cann_create_tensor(src);
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize,
- &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyScalar(scale));
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, scale, acl_dst);
+ ggml_cann_release_resources(ctx, scale, acl_src, acl_dst);
  }

  void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -403,36 +374,10 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  aclTensor* tmp_tensor =
  ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
  dst->ne, dst->nb, GGML_MAX_DIMS);
-
- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnArgsortGetWorkspaceSize(
- acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor,
- &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(
- aclnnArgsort(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- workspaceSize = 0;
- ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor,
- ggml_cann_type_mapping(dst->type),
- acl_dst, &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(tmp_tensor));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false),
+ tmp_tensor);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor, ggml_cann_type_mapping(dst->type), acl_dst);
+ ggml_cann_release_resources(ctx, acl_src, tmp_tensor, acl_dst);
  }

  void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -444,27 +389,11 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  float eps;
  memcpy(&eps, dst->op_params, sizeof(float));

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
  std::vector<int64_t> normData = {dst->ne[0]};
  aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
- ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr,
- eps, acl_dst, nullptr, nullptr,
- &workspaceSize, &executor));
-
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(
- aclnnLayerNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyIntArray(norm));
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src, norm, nullptr, nullptr,
+ eps, acl_dst, nullptr, nullptr);
+ ggml_cann_release_resources(ctx, norm, acl_src, acl_dst);
  }

  void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -478,10 +407,6 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  float eps;
  memcpy(&eps, dst->op_params + 1, sizeof(float));

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
  int64_t N = src->ne[3];
  int64_t C = src->ne[2];
  int64_t HxW = src->ne[1] * src->ne[0];
@@ -498,22 +423,9 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  aclTensor* acl_rstd_out = ggml_cann_create_tensor(
  (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);

- ACL_CHECK(aclnnGroupNormGetWorkspaceSize(
- acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst,
- acl_mean_out, acl_rstd_out, &workspaceSize, &executor));
-
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(
- aclnnGroupNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
- ACL_CHECK(aclDestroyTensor(acl_mean_out));
- ACL_CHECK(aclDestroyTensor(acl_rstd_out));
+ GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps,
+ acl_dst, acl_mean_out, acl_rstd_out);
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_mean_out, acl_rstd_out);
  }

  void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -536,68 +448,52 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  float alphaValue = 1.0f;
  alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
  if (!inplace) {
  size_t cpy_size = ggml_nbytes(dst);
- ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size,
- ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+ ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
  aclTensor* acl_src0 = ggml_cann_create_tensor(
  src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
- ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
- &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
- ACL_CHECK(
- aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
- ACL_CHECK(aclDestroyTensor(acl_src0));
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
+ ggml_cann_release_resources(ctx, acl_src0);
  } else {
- ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha,
- &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
- ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor,
- ctx.stream()));
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, acl_src1, alpha);
  }
-
- ACL_CHECK(aclDestroyTensor(acl_src1));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ ggml_cann_release_resources(ctx, acl_src1, acl_dst);
  }

- void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+ /**
+ * @brief Performs sum reduction on a given tensor along specified dimensions.
+ *
+ * This function reduces the input tensor by summing along the specified dimensions.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the reduced result will be stored.
+ * @param dim An array of dimension indices.
+ * @param dim_size The number of dimensions.
+ */
+ static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
+ int64_t* dim, size_t dim_size) {
+ GGML_ASSERT(dst->ne[0] == 1);
  ggml_tensor* src = dst->src[0];
-
  aclTensor* acl_src = ggml_cann_create_tensor(src);
-
- GGML_ASSERT(dst->ne[0] == 1);
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclIntArray* reduce_dims = aclCreateIntArray(dim, dim_size);

- int64_t reduce_dims_host[] = {3};
- aclIntArray* reduce_dims = aclCreateIntArray(reduce_dims_host, 1);
-
- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnReduceSumGetWorkspaceSize(
- acl_src, reduce_dims, true, ggml_cann_type_mapping(src->type), acl_dst,
- &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true,
+ ggml_cann_type_mapping(dst->type), acl_dst);
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, reduce_dims);
+ }

- ACL_CHECK(
- aclnnReduceSum(workspaceAddr, workspaceSize, executor, ctx.stream()));
+ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+ int64_t reduce_dims[] = {3};
+ aclnn_reduce_sum(ctx, dst, reduce_dims, 1);
+ }

- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+ int64_t reduce_dims[] = {0, 1, 2, 3};
+ aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
  }

  void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
@@ -611,23 +507,8 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
  std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
  auto output_size_array = aclCreateIntArray(output_size.data(), 2);

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize(
- acl_src, output_size_array, acl_dst, &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor,
- ctx.stream()));
-
- ACL_CHECK(aclDestroyIntArray(output_size_array));
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src, output_size_array, acl_dst);
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, output_size_array);
  }

  /**
@@ -650,23 +531,8 @@ static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
  aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnConstantPadNdGetWorkspaceSize(
- acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor));
-
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(aclnnConstantPadNd(workspaceAddr, workspaceSize, executor,
- ctx.stream()));
-
- ACL_CHECK(aclDestroyIntArray(acl_pad));
- ACL_CHECK(aclDestroyScalar(acl_value));
+ GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad, acl_value, acl_dst);
+ ggml_cann_release_resources(ctx, acl_pad, acl_value);
  }

  void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -682,9 +548,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
  0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
  aclnn_pad(ctx, acl_src, acl_dst, paddings);
-
- ACL_CHECK(aclDestroyTensor(acl_dst));
- ACL_CHECK(aclDestroyTensor(acl_src));
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
  }

  /**
@@ -730,28 +594,15 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
  bool count_include_pad = true;
  int64_t divisor_override = 0;
  int8_t cube_math_type = 0;
+ #ifdef ASCEND_310P
+ cube_math_type = 1;
+ #endif

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize(
- acl_src, kernel_size, strides, paddings_avg, ceil_mode,
- count_include_pad, divisor_override, cube_math_type, acl_dst,
- &workspaceSize, &executor));
-
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
- ACL_CHECK(
- aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
- ACL_CHECK(aclDestroyIntArray(kernel_size));
- ACL_CHECK(aclDestroyIntArray(strides));
- ACL_CHECK(aclDestroyIntArray(paddings_avg));
+ GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src, kernel_size, strides, paddings_avg,
+ ceil_mode, count_include_pad, divisor_override,
+ cube_math_type, acl_dst);
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, kernel_size, strides,
+ paddings_avg);
  }

  /**
@@ -819,29 +670,10 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,

  bool ceil_mode = false;
  int64_t auto_pads = 0;
-
- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnMaxPoolGetWorkspaceSize(
- tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations,
- ceil_mode, acl_dst, &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(
- aclnnMaxPool(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
- ACL_CHECK(aclDestroyTensor(tmp_tensor));
- ACL_CHECK(aclDestroyIntArray(kernel_size));
- ACL_CHECK(aclDestroyIntArray(strides));
- ACL_CHECK(aclDestroyIntArray(paddings_max));
- ACL_CHECK(aclDestroyIntArray(dilations));
+ GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor, kernel_size, strides, auto_pads,
+ paddings_max, dilations, ceil_mode, acl_dst);
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, tmp_tensor, kernel_size,
+ strides, paddings_max, dilations);
  }

  void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -872,207 +704,77 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  */
  static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  aclTensor* acl_dst) {
- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnInplaceCopyGetWorkspaceSize(acl_dst, acl_src, &workspaceSize,
- &executor));
-
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(
- aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, ctx.stream()));
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst, acl_src);
  }

  void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+ ggml_tensor* src0 = dst->src[0];

- aclTensor* acl_src = ggml_cann_create_tensor(src);
+ aclTensor* acl_src = ggml_cann_create_tensor(src0);
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-
- ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
- ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
- src->extra = src_extra_allocator.get();
- dst->extra = dst_extra_allocator.get();
- ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
- sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
- ctx.stream()));
- ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
- sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
- ctx.stream()));
-
- if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
- ggml_are_same_shape(src, dst)) {
- cann_copy(ctx, acl_src, acl_dst);
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
- return;
- }
- // TODO: simplify
- if (src->type == GGML_TYPE_F16) {
- if (dst->type == GGML_TYPE_Q8_0) {
- aclrtlaunch_ascendc_quantize_f16_q8_0(
- 24, ctx.stream(), src->data, dst->data,
- ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
- ((ggml_tensor*)dst->extra)->ne);
- return;
- }
- if (dst->type == GGML_TYPE_Q4_0) {
- aclrtlaunch_ascendc_quantize_f16_to_q4_0(
- 24, ctx.stream(), src->data, dst->data,
- ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
- ((ggml_tensor*)dst->extra)->ne);
- return;
- }
- if (dst->type == GGML_TYPE_F16) {
- if (ggml_are_same_shape(src, dst)) {
- cann_copy(ctx, acl_src, acl_dst);
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
- return;
- }
- if (ggml_is_contiguous(dst)) {
- const size_t src_type_size = ggml_type_size(src->type);
- if (src->nb[0] == src_type_size) {
- // src0 is contigous on first dimension, copy by rows
- int64_t rows_num = ggml_nrows(src);
-
- aclrtlaunch_ascendc_dup_by_rows_fp16(
- rows_num, ctx.stream(), src->data, dst->data,
- ((ggml_tensor*)src->extra)->ne,
- ((ggml_tensor*)src->extra)->nb,
- ((ggml_tensor*)dst->extra)->ne,
- ((ggml_tensor*)dst->extra)->nb);
- return;
- }
- GGML_ABORT("fatal error");
- }
- GGML_ABORT("fatal error");
- }
- if (dst->type == GGML_TYPE_F32) {
- if (ggml_are_same_shape(src, dst)) {
- cann_copy(ctx, acl_src, acl_dst);
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
- return;
- }
- if (ggml_is_contiguous(dst)) {
- const size_t src_type_size = ggml_type_size(src->type);
- if (src->nb[0] == src_type_size) {
- // src0 is contigous on first dimension, copy by rows
- int64_t rows_num = ggml_nrows(src);
- aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
- rows_num, ctx.stream(), src->data, dst->data,
- ((ggml_tensor*)src->extra)->ne,
- ((ggml_tensor*)src->extra)->nb,
- ((ggml_tensor*)dst->extra)->ne,
- ((ggml_tensor*)dst->extra)->nb);
- return;
- }
- GGML_ABORT("fatal error");
- }
- GGML_ABORT("fatal error");
- }
- // TODO
- GGML_ABORT("fatal error");
- } else if (src->type == GGML_TYPE_F32) {
- // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
- // && nb0 == type_size)
- if (dst->type == GGML_TYPE_Q8_0) {
- aclrtlaunch_ascendc_quantize_f32_q8_0(
- 24, ctx.stream(), src->data, dst->data,
- ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
- ((ggml_tensor*)dst->extra)->ne);
- return;
- }
- if (dst->type == GGML_TYPE_Q4_0) {
- aclrtlaunch_ascendc_quantize_f32_to_q4_0(
- 24, ctx.stream(), src->data, dst->data,
- ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
- ((ggml_tensor*)dst->extra)->ne);
- return;
+ if (ggml_are_same_shape(src0, dst)) {
+ if (dst->type == src0->type) {
+ cann_copy(ctx, acl_src, acl_dst);
+ } else {
+ aclnn_cast(ctx, acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
  }
- if (dst->type == GGML_TYPE_F32) {
- if (ggml_are_same_shape(src, dst)) {
- cann_copy(ctx, acl_src, acl_dst);
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ } else {
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+ if (dst->type == src0->type) {
+ size_t cpy_size = ggml_nbytes(dst);
+ ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
  return;
- }
- if (ggml_is_contiguous(dst)) {
- const size_t src_type_size = ggml_type_size(src->type);
- if (src->nb[0] == src_type_size) {
- // src0 is contigous on first dimension, copy by rows
- int64_t rows_num = ggml_nrows(src);
- aclrtlaunch_ascendc_dup_by_rows_fp32(
- rows_num, ctx.stream(), src->data, dst->data,
- ((ggml_tensor*)src->extra)->ne,
- ((ggml_tensor*)src->extra)->nb,
- ((ggml_tensor*)dst->extra)->ne,
- ((ggml_tensor*)dst->extra)->nb);
- return;
- }
- GGML_ABORT("fatal error");
  } else {
- // TODO: dst not contiguous
- GGML_ABORT("fatal error");
- }
- }
- if (dst->type == GGML_TYPE_F16) {
- if (ggml_are_same_shape(src, dst)) {
- cann_copy(ctx, acl_src, acl_dst);
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ ggml_cann_pool_alloc src_buffer_allocator(
+ ctx.pool(),
+ ggml_nelements(dst) * ggml_type_size(dst->type));
+ void* src_trans_buffer = src_buffer_allocator.get();
+ size_t src_trans_nb[GGML_MAX_DIMS];
+ src_trans_nb[0] = ggml_type_size(dst->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ }
+ aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+ src_trans_buffer, ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), src0->ne, src_trans_nb,
+ GGML_MAX_DIMS);
+
+ aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
+ size_t cpy_size = ggml_nbytes(dst);
+ ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
+ ggml_cann_release_resources(ctx, src_trans_tensor);
  return;
  }
- if (ggml_is_contiguous(dst)) {
- const size_t src_type_size = ggml_type_size(src->type);
- if (src->nb[0] == src_type_size) {
- // src0 is contigous on first dimension, copy by rows
- int64_t rows_num = ggml_nrows(src);
- aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
- rows_num, ctx.stream(), src->data, dst->data,
- ((ggml_tensor*)src->extra)->ne,
- ((ggml_tensor*)src->extra)->nb,
- ((ggml_tensor*)dst->extra)->ne,
- ((ggml_tensor*)dst->extra)->nb);
- return;
- }
- GGML_ABORT("fatal error");
+ } else if (ggml_is_contiguous(dst)) {
+ ggml_cann_pool_alloc src_buffer_allocator(
+ ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
+ void* src_trans_buffer = src_buffer_allocator.get();
+ size_t src_trans_nb[GGML_MAX_DIMS];
+ src_trans_nb[0] = ggml_type_size(dst->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
  }
- }
- // TODO
- GGML_ABORT("fatal error");
- } else {
- if (ggml_are_same_shape(src, dst)) {
- cann_copy(ctx, acl_src, acl_dst);
- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+ src_trans_buffer, ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), src0->ne, src_trans_nb,
+ GGML_MAX_DIMS);
+
+ aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
+
+ size_t cpy_size = ggml_nbytes(dst);
+ ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
+ ggml_cann_release_resources(ctx, src_trans_tensor);
  return;
+ } else {
+ GGML_ABORT("Unsupport dst is not tontiguous.");
  }
- GGML_ABORT("fatal error");
  }
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
  }

- #ifdef __cplusplus
- extern "C" {
- #endif
- aclnnStatus aclnnRmsNormGetWorkspaceSize(const aclTensor* x,
- const aclTensor* gamma, double epsilon,
- const aclTensor* yOut,
- const aclTensor* rstdOout,
- uint64_t* workspaceSize,
- aclOpExecutor** executor);
- aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize,
- aclOpExecutor* executor, aclrtStream stream);
- #ifdef __cplusplus
- }
- #endif
-
  /**
  * @brief Creates an ACL tensor initialized with zeros using a provided buffer.
  *
@@ -1098,7 +800,7 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
  nb[i] = nb[i - 1] * ne[i - 1];
  }

- ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream()));
+ ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
  aclTensor* zero =
  ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
  return zero;
@@ -1131,21 +833,7 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
1131
833
  float alpha_host = 1.0f;
1132
834
  aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
1133
835
  aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
1134
-
1135
- uint64_t workspaceSize = 0;
1136
- aclOpExecutor* executor;
1137
- void* workspaceAddr = nullptr;
1138
-
1139
- ACL_CHECK(aclnnInplaceAddsGetWorkspaceSize(acl_tensor, other, alpha,
1140
- &workspaceSize, &executor));
1141
-
1142
- if (workspaceSize > 0) {
1143
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1144
- workspaceAddr = workspace_allocator.get();
1145
- }
1146
- ACL_CHECK(
1147
- aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream()));
1148
-
836
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor, other, alpha);
1149
837
  return acl_tensor;
1150
838
  }
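The change above is the pattern repeated throughout this file: the manual aclnn workspace dance (GetWorkspaceSize, pool allocation, execute, destroy) is collapsed into GGML_CANN_CALL_ACLNN_OP plus ggml_cann_release_resources. Their definitions are not part of this diff; the sketch below only illustrates, under assumption, how such helpers could be shaped (token-pasting the op name, folding the aclDestroy* calls) and is not the actual implementation.

// Illustrative sketch only -- the real helpers live outside this hunk.
#define GGML_CANN_CALL_ACLNN_OP_SKETCH(ctx, OP, ...)                          \
    do {                                                                      \
        uint64_t ws_size = 0;                                                 \
        aclOpExecutor * executor = nullptr;                                   \
        void * ws_addr = nullptr;                                             \
        ACL_CHECK(aclnn##OP##GetWorkspaceSize(__VA_ARGS__, &ws_size,          \
                                              &executor));                    \
        ggml_cann_pool_alloc ws_alloc((ctx).pool());                          \
        if (ws_size > 0) {                                                    \
            ws_addr = ws_alloc.alloc(ws_size);                                \
        }                                                                     \
        ACL_CHECK(aclnn##OP(ws_addr, ws_size, executor, (ctx).stream()));     \
    } while (0)

// The release helper could overload on the handle type and fold the calls (C++17):
inline void cann_release_one(aclTensor * t)   { ACL_CHECK(aclDestroyTensor(t)); }
inline void cann_release_one(aclScalar * s)   { ACL_CHECK(aclDestroyScalar(s)); }
inline void cann_release_one(aclIntArray * a) { ACL_CHECK(aclDestroyIntArray(a)); }

template <typename... Args>
void ggml_cann_release_resources_sketch(ggml_backend_cann_context & /*ctx*/, Args... args) {
    (cann_release_one(args), ...);
}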
1151
839
 
@@ -1157,13 +845,6 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1157
845
 
1158
846
  float eps;
1159
847
  memcpy(&eps, dst->op_params, sizeof(float));
1160
-
1161
- GGML_ASSERT(eps > 0.0f);
1162
-
1163
- uint64_t workspaceSize = 0;
1164
- aclOpExecutor* executor;
1165
- void* workspaceAddr = nullptr;
1166
-
1167
848
  size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
1168
849
  ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
1169
850
 
@@ -1178,22 +859,8 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1178
859
  aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
1179
860
  src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
1180
861
  ggml_element_size(src));
1181
-
1182
- ACL_CHECK(aclnnRmsNormGetWorkspaceSize(
1183
- acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor));
1184
-
1185
- if (workspaceSize > 0) {
1186
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1187
- workspaceAddr = workspace_allocator.get();
1188
- }
1189
-
1190
- ACL_CHECK(
1191
- aclnnRmsNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
1192
-
1193
- ACL_CHECK(aclDestroyTensor(acl_src));
1194
- ACL_CHECK(aclDestroyTensor(acl_dst));
1195
- ACL_CHECK(aclDestroyTensor(acl_gamma));
1196
- ACL_CHECK(aclDestroyTensor(acl_rstd));
862
+ GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
863
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
1197
864
  }
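For reference, the RmsNorm call above implements the usual RMS normalization, y_i = x_i / sqrt(mean(x^2) + eps), and the gamma tensor it receives is filled with ones, so no extra scaling is applied at this level. A minimal scalar sketch of the same math, not taken from this codebase:

#include <cmath>
#include <cstdint>

// RMS norm over one row of n elements (sketch).
static void rms_norm_row_sketch(const float * x, float * y, int64_t n, float eps) {
    double sum_sq = 0.0;                              // mean of squares over the row
    for (int64_t i = 0; i < n; i++) {
        sum_sq += (double) x[i] * (double) x[i];
    }
    const float inv_rms = 1.0f / sqrtf((float) (sum_sq / (double) n) + eps);
    for (int64_t i = 0; i < n; i++) {
        y[i] = x[i] * inv_rms;                        // gamma == 1 in the call above
    }
}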
1198
865
 
1199
866
  // TODO: performance is low.
@@ -1215,75 +882,14 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
1215
882
  src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
1216
883
  ggml_element_size(src), value);
1217
884
 
1218
- uint64_t workspaceSize = 0;
1219
- aclOpExecutor* executor;
1220
- void* workspaceAddr = nullptr;
1221
-
1222
- ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1,
1223
- &workspaceSize, &executor));
1224
- if (workspaceSize > 0) {
1225
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1226
- workspaceAddr = workspace_allocator.get();
1227
- }
1228
-
1229
- ACL_CHECK(
1230
- aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream()));
1231
-
1232
- ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst,
1233
- &workspaceSize, &executor));
1234
- if (workspaceSize > 0) {
1235
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1236
- workspaceAddr = workspace_allocator.get();
1237
- }
1238
-
1239
- ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream()));
1240
-
1241
885
  aclScalar* alpha = nullptr;
1242
886
  float alphaValue = 1.0f;
1243
887
  alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
1244
888
 
1245
- ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha,
1246
- &workspaceSize, &executor));
1247
- if (workspaceSize > 0) {
1248
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1249
- workspaceAddr = workspace_allocator.get();
1250
- }
1251
- ACL_CHECK(
1252
- aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
1253
-
1254
- ACL_CHECK(aclDestroyScalar(alpha));
1255
- ACL_CHECK(aclDestroyTensor(mask_tensor));
1256
- ACL_CHECK(aclDestroyTensor(acl_src));
1257
- ACL_CHECK(aclDestroyTensor(acl_dst));
1258
- }
1259
-
1260
- /**
1261
- * @brief Casts the data type of a source tensor to a destination tensor.
1262
- *
1263
- * This function casts the data type of the source tensor `acl_src` to the
1264
- * specified data type `cast_data_type` and stores the result in the destination
1265
- * tensor `acl_dst`.
1266
- *
1267
- * @param ctx The context for the CANN backend operations.
1268
- * @param acl_src The source tensor whose data type will be casted.
1269
- * @param acl_dst The destination tensor where the casted result will be stored.
1270
- * @param cast_data_type The target data type to which the source tensor will be
1271
- * casted.
1272
- */
1273
- static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1274
- aclTensor* acl_dst, aclDataType cast_data_type) {
1275
- uint64_t workspaceSize = 0;
1276
- aclOpExecutor* executor;
1277
- void* workspaceAddr = nullptr;
1278
-
1279
- ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst,
1280
- &workspaceSize, &executor));
1281
- if (workspaceSize > 0) {
1282
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1283
- workspaceAddr = workspace_allocator.get();
1284
- }
1285
-
1286
- ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
889
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor, n_past + 1);
890
+ GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src, n_past + 1, acl_dst);
891
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, mask_tensor, alpha);
892
+ ggml_cann_release_resources(ctx, alpha, acl_src, acl_dst, mask_tensor);
1287
893
  }
1288
894
 
1289
895
  /**
@@ -1304,39 +910,9 @@ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1304
910
  static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1305
911
  aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
1306
912
  aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
1307
-
1308
- uint64_t workspaceSize = 0;
1309
- aclOpExecutor* executor;
1310
- void* workspaceAddr = nullptr;
1311
-
1312
- ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst,
1313
- &workspaceSize, &executor));
1314
- if (workspaceSize > 0) {
1315
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1316
- workspaceAddr = workspace_allocator.get();
1317
- }
1318
-
1319
- ACL_CHECK(
1320
- aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));
1321
-
1322
- ACL_CHECK(aclDestroyIntArray(acl_dims));
1323
- }
1324
-
1325
- #ifdef __cplusplus
1326
- extern "C" {
1327
- #endif
1328
- aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self,
1329
- const aclIntArray* kernelSize,
1330
- const aclIntArray* dilation,
1331
- const aclIntArray* padding,
1332
- const aclIntArray* stride,
1333
- aclTensor* out, uint64_t* workspaceSize,
1334
- aclOpExecutor** executor);
1335
- aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
1336
- aclOpExecutor* executor, aclrtStream stream);
1337
- #ifdef __cplusplus
913
+ GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims, acl_dst);
914
+ ggml_cann_release_resources(ctx, acl_dims);
1338
915
  }
1339
- #endif
1340
916
 
1341
917
  static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
1342
918
  ggml_tensor* dst,
@@ -1356,8 +932,7 @@ static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
1356
932
  aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
1357
933
  }
1358
934
 
1359
- // release
1360
- ACL_CHECK(aclDestroyTensor(acl_dst));
935
+ ggml_cann_release_resources(ctx, acl_dst);
1361
936
  }
1362
937
 
1363
938
  static void ggml_cann_im2col_1d_post_process(
@@ -1379,7 +954,6 @@ static void ggml_cann_im2col_1d_post_process(
1379
954
 
1380
955
  // Permute: [N, IC * KH * KW, OW * OH] ->
1381
956
  // [N, OW * OH * n_bytes_factor, IC * KH * KW]
1382
- aclTensor* tmp_permute_tensor = nullptr;
1383
957
  ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
1384
958
  tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1385
959
  void* tmp_permute_buffer = tmp_permute_allocator.get();
@@ -1391,7 +965,7 @@ static void ggml_cann_im2col_1d_post_process(
1391
965
  tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1392
966
  }
1393
967
 
1394
- tmp_permute_tensor = ggml_cann_create_tensor(
968
+ aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
1395
969
  tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
1396
970
  ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
1397
971
  GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
@@ -1421,9 +995,8 @@ static void ggml_cann_im2col_1d_post_process(
1421
995
  c * KH * KW * n_step_w * ggml_type_size(dst->type);
1422
996
 
1423
997
  for (int i = 0; i < n_step_w; i++) {
1424
- ACL_CHECK(aclrtMemcpyAsync(
1425
- cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
1426
- ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
998
+ ggml_cann_async_memcpy(ctx, cur_dst_buffer, cur_permute_buffer, size_cpy,
999
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
1427
1000
  cur_dst_buffer =
1428
1001
  (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
1429
1002
  cur_permute_buffer = (char*)cur_permute_buffer +
@@ -1433,13 +1006,11 @@ static void ggml_cann_im2col_1d_post_process(
1433
1006
  } else {
1434
1007
  offset = KH * KW * n_step_w *
1435
1008
  ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
1436
- ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
1437
- (char*)tmp_permute_buffer + offset, offset,
1438
- ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1009
+ ggml_cann_async_memcpy(ctx, dst->data, (char*)tmp_permute_buffer + offset, offset,
1010
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
1439
1011
  }
1440
1012
 
1441
- // release
1442
- ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
1013
+ ggml_cann_release_resources(ctx, tmp_permute_tensor);
1443
1014
  }
1444
1015
 
1445
1016
  void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -1501,23 +1072,8 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1501
1072
  auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
1502
1073
  auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
1503
1074
  auto* strides = aclCreateIntArray(stride_dims.data(), 2);
1504
-
1505
- uint64_t workspaceSize = 0;
1506
- aclOpExecutor* executor;
1507
- void* workspaceAddr = nullptr;
1508
-
1509
- ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations,
1510
- paddings, strides, tmp_im2col_tensor,
1511
- &workspaceSize, &executor));
1512
-
1513
- ggml_cann_pool_alloc workspace_allocator(ctx.pool());
1514
- if (workspaceSize > 0) {
1515
- workspace_allocator.alloc(workspaceSize);
1516
- workspaceAddr = workspace_allocator.get();
1517
- }
1518
-
1519
- ACL_CHECK(
1520
- aclnnIm2col(workspaceAddr, workspaceSize, executor, ctx.stream()));
1075
+ GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1, kernel_size, dilations,
1076
+ paddings, strides, tmp_im2col_tensor);
1521
1077
 
1522
1078
  // Cast if dst is f16.
1523
1079
  aclTensor* tmp_cast_tensor = nullptr;
@@ -1532,328 +1088,53 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1532
1088
  temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
1533
1089
  }
1534
1090
 
1535
- tmp_cast_tensor = ggml_cann_create_tensor(
1536
- tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
1537
- ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
1538
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1539
- aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
1540
- ggml_cann_type_mapping(dst->type));
1541
- }
1542
-
1543
- // post-processing
1544
- if (is_2D) {
1545
- ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
1546
- tmp_im2col_tensor);
1547
- } else {
1548
- std::vector<int64_t> im2col_op_params = {
1549
- KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
1550
- ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
1551
- tmp_im2col_tensor, im2col_op_params);
1552
- }
1553
-
1554
- // release
1555
- ACL_CHECK(aclDestroyTensor(acl_src1));
1556
- ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
1557
- ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
1558
- ACL_CHECK(aclDestroyIntArray(kernel_size));
1559
- ACL_CHECK(aclDestroyIntArray(dilations));
1560
- ACL_CHECK(aclDestroyIntArray(paddings));
1561
- ACL_CHECK(aclDestroyIntArray(strides));
1562
- }
1563
-
1564
- /**
1565
- * @brief Applies element-wise exponential function to the elements of a tensor.
1566
- *
1567
- * This function computes the exponential of each element in the source tensor
1568
- * `acl_src` and stores the result back into the same tensor.
1569
- * The operation is defined as:
1570
- * \f[
1571
- * \text {acl_src }_i=e^{acl\_src_i}
1572
- * \f]
1573
- *
1574
- * @param ctx The context for the CANN backend operations.
1575
- * @param acl_src The tensor on which the exponential function will be applied.
1576
- */
1577
- static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
1578
- uint64_t workspaceSize = 0;
1579
- aclOpExecutor* executor;
1580
- void* workspaceAddr = nullptr;
1581
-
1582
- ACL_CHECK(
1583
- aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor));
1584
- if (workspaceSize > 0) {
1585
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1586
- workspaceAddr = workspace_allocator.get();
1587
- }
1588
-
1589
- ACL_CHECK(
1590
- aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream()));
1591
- }
1592
-
1593
- /**
1594
- * @brief Multiplies elements of a tensor by a scalar value, optionally
1595
- * in-place.
1596
- *
1597
- * This function multiplies each element of the source tensor `acl_src` by the
1598
- * scalar `scale` and stores the result in the destination tensor `acl_dst`. If
1599
- * `inplace` is true, `acl_dst` will not be used and the operation is performed
1600
- * in-place on `acl_src`.
1601
- * The operation is defined as:
1602
- * \f[
1603
- * \text {acl_dst }_i=\text {acl_src }_i \times \text {scale}
1604
- * \f]
1605
- *
1606
- * @param ctx The context for the CANN backend operations.
1607
- * @param acl_src The source tensor whose elements will be multiplied.
1608
- * @param scale The scalar value by which each element of `acl_src` will be
1609
- * multiplied.
1610
- * @param acl_dst The destination tensor where the result will be stored if
1611
- * `inplace` is false.
1612
- * @param inplace Flag indicating whether to perform the operation in-place on
1613
- * `acl_src`.
1614
- */
1615
- static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1616
- float scale, aclTensor* acl_dst, bool inplace) {
1617
- aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
1618
-
1619
- uint64_t workspaceSize = 0;
1620
- aclOpExecutor* executor;
1621
- void* workspaceAddr = nullptr;
1622
-
1623
- if (inplace) {
1624
- ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale,
1625
- &workspaceSize, &executor));
1626
- if (workspaceSize > 0) {
1627
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1628
- workspaceAddr = workspace_allocator.get();
1629
- }
1630
-
1631
- ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor,
1632
- ctx.stream()));
1633
- } else {
1634
- ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst,
1635
- &workspaceSize, &executor));
1636
- if (workspaceSize > 0) {
1637
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1638
- workspaceAddr = workspace_allocator.get();
1639
- }
1640
-
1641
- ACL_CHECK(
1642
- aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
1643
- }
1644
-
1645
- ACL_CHECK(aclDestroyScalar(acl_scale));
1646
- }
1647
-
1648
- /**
1649
- * @brief Performs an in-place element-wise multiplication of two tensors.
1650
- *
1651
- * This function performs an element-wise multiplication of the tensors
1652
- * `acl_src` and `acl_other` and stores the result in `acl_src`.
1653
- * The operation is defined as:
1654
- * \f[
1655
- * \text {acl_src }_i=\text {acl_src }_i \times \text {acl_other }_i
1656
- * \f]
1657
- *
1658
- * @param ctx The context for the CANN backend operations.
1659
- * @param acl_src The source tensor where the multiplication result will be
1660
- * stored.
1661
- * @param acl_other The tensor whose elements will be multiplied with `acl_src`.
1662
- */
1663
- static void aclnn_inplace_mul(ggml_backend_cann_context& ctx,
1664
- aclTensor* acl_src, aclTensor* acl_other) {
1665
- uint64_t workspaceSize = 0;
1666
- aclOpExecutor* executor;
1667
- void* workspaceAddr = nullptr;
1668
-
1669
- ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other,
1670
- &workspaceSize, &executor));
1671
- if (workspaceSize > 0) {
1672
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1673
- workspaceAddr = workspace_allocator.get();
1674
- }
1675
-
1676
- ACL_CHECK(
1677
- aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
1678
- }
1679
-
1680
- /**
1681
- * @brief Performs element-wise multiplication of two tensors and stores the
1682
- * result in a destination tensor.
1683
- *
1684
- * This function performs element-wise multiplication of the tensors `acl_src`
1685
- * and `acl_other` and stores the result in the destination tensor `acl_dst`.
1686
- * The operation is defined as:
1687
- * \f[
1688
- * \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i
1689
- * \f]
1690
- *
1691
- * @param ctx The context for the CANN backend operations.
1692
- * @param acl_src The first tensor for element-wise multiplication.
1693
- * @param acl_other The second tensor for element-wise multiplication.
1694
- * @param acl_dst The destination tensor where the result will be stored.
1695
- */
1696
- static void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1697
- aclTensor* acl_other, aclTensor* acl_dst) {
1698
- uint64_t workspaceSize = 0;
1699
- aclOpExecutor* executor;
1700
- void* workspaceAddr = nullptr;
1701
-
1702
- ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst,
1703
- &workspaceSize, &executor));
1704
- if (workspaceSize > 0) {
1705
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1706
- workspaceAddr = workspace_allocator.get();
1707
- }
1708
-
1709
- ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
1710
- }
1711
-
1712
- /**
1713
- * @brief Applies element-wise cosine function to the elements of a tensor.
1714
- *
1715
- * This function computes the cosine of each element in the source tensor
1716
- * `acl_src` and stores the result in the destination tensor `acl_dst`. The
1717
- * operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src
1718
- * }_i\right) \f]
1719
- *
1720
- * @param ctx The context for the CANN backend operations.
1721
- * @param acl_src The source tensor on which the cosine function will be
1722
- * applied.
1723
- * @param acl_dst The destination tensor where the cosine results will be
1724
- * stored.
1725
- */
1726
- static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1727
- aclTensor* acl_dst) {
1728
- uint64_t workspaceSize = 0;
1729
- aclOpExecutor* executor;
1730
- void* workspaceAddr = nullptr;
1731
-
1732
- ACL_CHECK(
1733
- aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
1734
- if (workspaceSize > 0) {
1735
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1736
- workspaceAddr = workspace_allocator.get();
1737
- }
1738
-
1739
- ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream()));
1740
- }
1741
-
1742
- /**
1743
- * @brief Applies element-wise sine function to the elements of a tensor.
1744
- *
1745
- * This function computes the sine of each element in the source tensor
1746
- `acl_src`
1747
- * and stores the result in the destination tensor `acl_dst`.
1748
- * The operation is defined as:
1749
- * \f[
1750
- * \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right)
1751
- * \f]
1091
+ tmp_cast_tensor = ggml_cann_create_tensor(
1092
+ tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
1093
+ ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
1094
+ GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1095
+ aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, ggml_cann_type_mapping(dst->type));
1096
+ }
1752
1097
 
1753
- * @param ctx The context for the CANN backend operations.
1754
- * @param acl_src The source tensor on which the sine function will be applied.
1755
- * @param acl_dst The destination tensor where the sine results will be stored.
1756
- */
1757
- static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1758
- aclTensor* acl_dst) {
1759
- uint64_t workspaceSize = 0;
1760
- aclOpExecutor* executor;
1761
- void* workspaceAddr = nullptr;
1762
-
1763
- ACL_CHECK(
1764
- aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
1765
- if (workspaceSize > 0) {
1766
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1767
- workspaceAddr = workspace_allocator.get();
1098
+ // post-processing
1099
+ if (is_2D) {
1100
+ ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
1101
+ tmp_im2col_tensor);
1102
+ } else {
1103
+ std::vector<int64_t> im2col_op_params = {
1104
+ KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
1105
+ ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
1106
+ tmp_im2col_tensor, im2col_op_params);
1768
1107
  }
1769
1108
 
1770
- ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
1109
+ ggml_cann_release_resources(ctx, acl_src1, tmp_im2col_tensor, tmp_cast_tensor,
1110
+ kernel_size, dilations, paddings, strides);
1771
1111
  }
1772
1112
 
1773
1113
  /**
1774
- * @brief Performs element-wise division of tensor1 by tensor2 , multiplies the
1775
- result by the scalar value and adds it to self .
1114
+ * @brief Applies element-wise exponential function to the elements of a tensor.
1776
1115
  *
1777
- * Performs element-wise division of tensor1 by tensor2,
1778
- * multiplies the result by the scalar value and adds it to self .
1116
+ * This function computes the exponential of each element in the source tensor
1117
+ * `acl_src` and stores the result back into the same tensor.
1779
1118
  * The operation is defined as:
1780
1119
  * \f[
1781
- * \text{out}_i = \text{selft}_i + \text{value} \times
1782
- \frac{\text{tensor1}_i}{\text{tensor2}_i}
1783
- * \f]
1784
-
1785
- * @param ctx The context for the CANN backend operations.
1786
- * @param acl_self The source tensor on which the addcdiv function will be
1787
- applied.
1788
- * @param tensor1 Numerator tensor.
1789
- * @param tensor2 Denominator tensor.
1790
- * @param value The value to be used for coefficient.
1791
- */
1792
- static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
1793
- aclTensor* acl_self, aclTensor* tensor1,
1794
- aclTensor* tensor2, float value) {
1795
- uint64_t workspaceSize = 0;
1796
- aclOpExecutor* executor;
1797
- void* workspaceAddr = nullptr;
1798
- aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
1799
-
1800
- ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
1801
- acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
1802
- if (workspaceSize > 0) {
1803
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1804
- workspaceAddr = workspace_allocator.get();
1805
- }
1806
-
1807
- ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
1808
- ctx.stream()));
1809
- }
1810
-
1811
- /**
1812
- * @brief Matrix division, optionally in-place.
1813
- *
1814
- * This function division each element of the source tensor `acl_src` by the
1815
- * tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
1816
- * If `inplace` is true, `acl_dst` will not be used and the operation is
1817
- * performed in-place on `acl_src`. The operation is defined as: \f[
1818
- * \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
1120
+ * \text {acl_src }_i=e^{acl\_src_i}
1819
1121
  * \f]
1820
1122
  *
1821
1123
  * @param ctx The context for the CANN backend operations.
1822
- * @param acl_src Numerator tensor..
1823
- * @param acl_other Denominator tensor.
1824
- * @param acl_dst The destination tensor where the result will be stored if
1825
- * `inplace` is false.
1826
- * @param inplace Flag indicating whether to perform the operation in-place on
1827
- * `acl_src`.
1124
+ * @param acl_src The tensor on which the exponential function will be applied.
1828
1125
  */
1829
- static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1830
- aclTensor* acl_other, aclTensor* acl_dst,
1831
- bool inplace) {
1832
- uint64_t workspaceSize = 0;
1833
- aclOpExecutor* executor;
1834
- void* workspaceAddr = nullptr;
1835
-
1836
- if (inplace) {
1837
- ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
1838
- &workspaceSize, &executor));
1839
- if (workspaceSize > 0) {
1840
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1841
- workspaceAddr = workspace_allocator.get();
1842
- }
1126
+ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
1127
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceExp, acl_src);
1128
+ }
1843
1129
 
1844
- ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
1845
- ctx.stream()));
1846
- } else {
1847
- ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
1848
- &workspaceSize, &executor));
1849
- if (workspaceSize > 0) {
1850
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1851
- workspaceAddr = workspace_allocator.get();
1852
- }
1130
+ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1131
+ aclTensor* acl_dst) {
1132
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
1133
+ }
1853
1134
 
1854
- ACL_CHECK(
1855
- aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
1856
- }
1135
+ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1136
+ aclTensor* acl_dst) {
1137
+ GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
1857
1138
  }
1858
1139
 
1859
1140
  void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
@@ -1902,13 +1183,13 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1902
1183
 
1903
1184
  ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
1904
1185
  void* tmp_permute_buffer = permute_allocator.get();
1905
- aclTensor* tmp_permute_tenosr = ggml_cann_create_tensor(
1186
+ aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
1906
1187
  tmp_permute_buffer, ggml_cann_type_mapping(src->type),
1907
1188
  ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
1908
1189
  GGML_MAX_DIMS, ACL_FORMAT_ND);
1909
1190
  int64_t permute_dim[] = {0, 1, 3, 2};
1910
1191
  int64_t num_dims = 4;
1911
- aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims);
1192
+ aclnn_permute(ctx, acl_src, tmp_permute_tensor, permute_dim, num_dims);
1912
1193
 
1913
1194
  // timestep * freq
1914
1195
  int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
@@ -1929,7 +1210,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1929
1210
  tmp_mul_buffer, ggml_cann_type_mapping(src->type),
1930
1211
  ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1931
1212
  ACL_FORMAT_ND);
1932
- aclnn_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, tmp_mul_tensor);
1213
+ aclnn_mul(ctx, tmp_permute_tensor, tmp_arange_tensor, tmp_mul_tensor);
1933
1214
 
1934
1215
  // cos
1935
1216
  ggml_cann_pool_alloc cos_allocator(
@@ -1957,17 +1238,13 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1957
1238
  int64_t concat_dim = 3;
1958
1239
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1959
1240
  aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
1960
- aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
1961
- aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
1241
+ aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
1242
+ aclnn_concat(ctx, tensor_list, acl_dst, concat_dim);
1962
1243
 
1963
1244
  // release
1964
1245
  // segmentation fault when deleting both the tensor list and its elements.
1965
- ACL_CHECK(aclDestroyTensorList(tensorList));
1966
- ACL_CHECK(aclDestroyTensor(acl_src));
1967
- ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
1968
- ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr));
1969
- ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
1970
- ACL_CHECK(aclDestroyTensor(acl_dst));
1246
+ ggml_cann_release_resources(ctx, tensor_list, acl_src, tmp_arange_tensor,
1247
+ tmp_permute_tensor, tmp_mul_tensor, acl_dst);
1971
1248
  }
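The sequence above (arange over the half dimension, multiply by per-channel factors, cos and sin, then concat along the last dimension) is the standard timestep embedding. A scalar sketch of that math follows; the max_period-based frequency schedule is the conventional formulation and is assumed here rather than read from this hunk.

#include <cmath>
#include <cstdint>

// out has `dim` floats: cos(t * f_j) for j < dim/2, followed by sin(t * f_j).
static void timestep_embedding_row_sketch(float t, int64_t dim, float max_period, float * out) {
    const int64_t half = dim / 2;
    for (int64_t j = 0; j < half; j++) {
        const float freq = expf(-logf(max_period) * (float) j / (float) half); // assumed schedule
        out[j]        = cosf(t * freq);
        out[j + half] = sinf(t * freq);
    }
}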
1972
1249
 
1973
1250
  /**
@@ -1983,21 +1260,8 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1983
1260
  static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
1984
1261
  aclTensor* acl_dst) {
1985
1262
  auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
1986
-
1987
- uint64_t workspaceSize = 0;
1988
- aclOpExecutor* executor;
1989
- void* workspaceAddr = nullptr;
1990
-
1991
- ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize(
1992
- acl_dst, acl_scalar, &workspaceSize, &executor));
1993
- if (workspaceSize > 0) {
1994
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1995
- workspaceAddr = workspace_allocator.get();
1996
- }
1997
-
1998
- ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor,
1999
- ctx.stream()));
2000
- ACL_CHECK(aclDestroyScalar(acl_scalar));
1263
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
1264
+ ggml_cann_release_resources(ctx, acl_scalar);
2001
1265
  }
2002
1266
 
2003
1267
  /**
@@ -2018,19 +1282,7 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
2018
1282
  */
2019
1283
  static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
2020
1284
  aclTensor* acl_dst, aclTensor* acl_exp) {
2021
- uint64_t workspaceSize = 0;
2022
- aclOpExecutor* executor;
2023
- void* workspaceAddr = nullptr;
2024
-
2025
- ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize(
2026
- acl_dst, acl_exp, &workspaceSize, &executor));
2027
- if (workspaceSize > 0) {
2028
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2029
- workspaceAddr = workspace_allocator.get();
2030
- }
2031
-
2032
- ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize,
2033
- executor, ctx.stream()));
1285
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp);
2034
1286
  }
2035
1287
 
2036
1288
  /**
@@ -2182,56 +1434,15 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2182
1434
 
2183
1435
  // add
2184
1436
  aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
2185
-
2186
- ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor));
2187
- ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor));
2188
- ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor));
2189
- ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor));
2190
- ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor));
2191
- ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
2192
- ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
2193
- ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
1437
+ ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
1438
+ tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
1439
+ tmp_arange_tensor, tmp_mk_tensor, tmp_output_tensor);
2194
1440
  }
2195
1441
 
2196
1442
  void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2197
1443
  ggml_cann_dup(ctx, dst);
2198
1444
  }
2199
1445
 
2200
- /**
2201
- * @brief Performs element-wise addition of two tensors in place.
2202
- *
2203
- * This function adds the source tensor `acl_src` to the destination tensor
2204
- * `acl_dst` element-wise and stores the result in the destination tensor
2205
- * `acl_dst`.
2206
- *
2207
- * @param ctx The context for the CANN backend operations.
2208
- * @param acl_src The source tensor to be added.
2209
- * @param acl_dst The destination tensor which will hold the result of the
2210
- * addition.
2211
- */
2212
- static void aclnn_inplace_add(ggml_backend_cann_context& ctx,
2213
- aclTensor* acl_src, aclTensor* acl_dst) {
2214
- aclScalar* alpha = nullptr;
2215
- float alphaValue = 1.0f;
2216
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2217
-
2218
- uint64_t workspaceSize = 0;
2219
- aclOpExecutor* executor;
2220
- void* workspaceAddr = nullptr;
2221
-
2222
- ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha,
2223
- &workspaceSize, &executor));
2224
- if (workspaceSize > 0) {
2225
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2226
- workspaceAddr = workspace_allocator.get();
2227
- }
2228
-
2229
- ACL_CHECK(
2230
- aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
2231
-
2232
- ACL_CHECK(aclDestroyScalar(alpha));
2233
- }
2234
-
2235
1446
  /**
2236
1447
  * @brief Applies the softmax function to a tensor along a specified dimension.
2237
1448
  *
@@ -2248,20 +1459,7 @@ static void aclnn_inplace_add(ggml_backend_cann_context& ctx,
2248
1459
  */
2249
1460
  static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2250
1461
  int64_t dim, aclTensor* acl_dst) {
2251
- uint64_t workspaceSize = 0;
2252
- aclOpExecutor* executor;
2253
- void* workspaceAddr = nullptr;
2254
-
2255
- ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(acl_src, dim, acl_dst,
2256
- &workspaceSize, &executor));
2257
-
2258
- if (workspaceSize > 0) {
2259
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2260
- workspaceAddr = workspace_allocator.get();
2261
- }
2262
-
2263
- aclrtStream stream = ctx.stream();
2264
- ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream));
1462
+ GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
2265
1463
  }
2266
1464
 
2267
1465
  void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -2311,8 +1509,7 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2311
1509
  src1_fp32_nb, GGML_MAX_DIMS);
2312
1510
  aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
2313
1511
  aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
2314
-
2315
- ACL_CHECK(aclDestroyTensor(acl_src1));
1512
+ ggml_cann_release_resources(ctx, acl_src1);
2316
1513
  } else {
2317
1514
  acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
2318
1515
  }
@@ -2365,98 +1562,158 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2365
1562
 
2366
1563
  // softmax
2367
1564
  aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
2368
- ACL_CHECK(aclDestroyTensor(alibi_output_tensor));
1565
+ ggml_cann_release_resources(ctx, alibi_output_tensor);
2369
1566
  } else {
2370
1567
  aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
2371
1568
  }
2372
1569
 
2373
- ACL_CHECK(aclDestroyTensor(acl_src0));
2374
- ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor));
2375
- ACL_CHECK(aclDestroyTensor(acl_dst));
2376
- ACL_CHECK(aclDestroyScalar(acl_scale));
2377
- ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor));
2378
- ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
1570
+ ggml_cann_release_resources(ctx, acl_src0, acl_src1_fp32_tensor, acl_dst,
1571
+ acl_scale, acl_input_mul_scale_tensor, tmp_mask_tensor);
2379
1572
  }
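ggml_cann_softmax above evaluates softmax(src0 * scale + mask) along the innermost dimension, mixing in an additional ALiBi bias when max_bias is non-zero. A numerically stable scalar sketch of the non-ALiBi case, for illustration only:

#include <cmath>
#include <cstdint>

// softmax(x * scale + mask) over one row of n elements (sketch).
static void soft_max_row_sketch(const float * x, const float * mask, float scale,
                                int64_t n, float * out) {
    float max_val = -INFINITY;
    for (int64_t i = 0; i < n; i++) {
        out[i]  = x[i] * scale + (mask ? mask[i] : 0.0f);
        max_val = out[i] > max_val ? out[i] : max_val;
    }
    float sum = 0.0f;
    for (int64_t i = 0; i < n; i++) {
        out[i] = expf(out[i] - max_val);   // subtract the row max for stability
        sum   += out[i];
    }
    for (int64_t i = 0; i < n; i++) {
        out[i] /= sum;
    }
}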
2380
1573
 
2381
- void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2382
- ggml_tensor* src0 = dst->src[0];
2383
- ggml_tensor* src1 = dst->src[1];
1574
+ /**
1575
+ * @brief Performs embedding operation on a 4D tensor using the CANN backend.
1576
+ *
1577
+ * This function extracts slices from the source tensor (`src_buffer`),
1578
+ * index tensor (`index`), and destination tensor (`dst`), and performs an
1579
+ * embedding operation on them. The embedding operation is applied by iterating
1580
+ * over the last two dimensions of the source tensor, creating the necessary
1581
+ * tensors for the source, index, and output, and executing the embedding operation.
1582
+ *
1583
+ * @param ctx The context for CANN backend operations.
1584
+ * @param src_buffer The source buffer holding the data for the source tensor.
1585
+ * @param src_ne The dimensions of the source tensor.
1586
+ * @param src_nb The strides (byte offsets) of the source tensor.
1587
+ * @param index The index tensor used in the embedding operation.
1588
+ * @param dst The destination tensor where the result will be stored.
1589
+ */
1590
+ static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
1591
+ int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
1592
+ ggml_tensor* dst) {
1593
+ for (int64_t i = 0; i < src_ne[3]; i++) {
1594
+ for (int64_t j = 0; j < src_ne[2]; j++) {
1595
+ // src
1596
+ int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
1597
+ size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
1598
+ aclTensor* acl_src_tensor = ggml_cann_create_tensor(
1599
+ (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
1600
+ ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
1601
+ acl_src_ne, acl_src_nb, 2);
1602
+
1603
+ // index
1604
+ int64_t acl_index_ne[1] = {index->ne[0]};
1605
+ size_t acl_index_nb[1] = {index->nb[0]};
1606
+ aclTensor* acl_index = ggml_cann_create_tensor(
1607
+ (char*)index->data + i * index->nb[2] + j * index->nb[1],
1608
+ ggml_cann_type_mapping(index->type), ggml_element_size(index),
1609
+ acl_index_ne, acl_index_nb, 1);
1610
+
1611
+ // out
1612
+ int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
1613
+ size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
1614
+ aclTensor* acl_out = ggml_cann_create_tensor(
1615
+ (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
1616
+ ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
1617
+ acl_out_ne, acl_out_nb, 2);
1618
+ GGML_CANN_CALL_ACLNN_OP(ctx, Embedding, acl_src_tensor, acl_index, acl_out);
1619
+ ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
1620
+ }
1621
+ }
1622
+ }
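aclnn_embedding_4d reduces the 4-D case to repeated 2-D aclnnEmbedding calls, and each of those is a plain row gather: out[r] = src[index[r]]. A scalar equivalent of one 2-D slice, for reference only (the int32 index type is an assumption):

#include <cstdint>

static void gather_rows_2d_sketch(const float * src, int64_t src_cols,
                                  const int32_t * index, int64_t n_rows, float * out) {
    for (int64_t r = 0; r < n_rows; r++) {
        const float * src_row = src + (int64_t) index[r] * src_cols; // row selected by the index
        for (int64_t c = 0; c < src_cols; c++) {
            out[r * src_cols + c] = src_row[c];
        }
    }
}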
2384
1623
 
2385
- ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
2386
- ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
2387
- ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
2388
- src0->extra = src0_extra_allocator.get();
2389
- src1->extra = src1_extra_allocator.get();
2390
- dst->extra = dst_extra_allocator.get();
2391
- ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
2392
- sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
2393
- ctx.stream()));
2394
- ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
2395
- sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
2396
- ctx.stream()));
2397
- ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
2398
- sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
2399
- ctx.stream()));
1624
+ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1625
+ ggml_tensor* src0 = dst->src[0]; // src
1626
+ ggml_tensor* src1 = dst->src[1]; // index
2400
1627
 
2401
1628
  switch (src0->type) {
2402
1629
  case GGML_TYPE_F32: {
2403
- #ifdef ASCEND_310P
2404
- // Special operation for get_row_f32 kernel of 310P: clear the
2405
- // content of dest data buffer when row is not aligned to 32 bytes
2406
- if ((src0->ne[0] % 8) != 0) {
2407
- size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
2408
- src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
2409
- ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
2410
- }
2411
- #endif
2412
- aclrtlaunch_ascendc_get_row_f32(
2413
- 24, ctx.stream(), src0->data, src1->data, dst->data,
2414
- ((ggml_tensor*)src0->extra)->ne,
2415
- ((ggml_tensor*)src0->extra)->nb,
2416
- ((ggml_tensor*)src1->extra)->ne,
2417
- ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2418
- ((ggml_tensor*)dst->extra)->nb);
1630
+ aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
1631
+ dst);
2419
1632
  break;
2420
1633
  }
2421
1634
  case GGML_TYPE_F16: {
2422
- #ifdef ASCEND_310P
2423
- // Special operation for get_row_f16 kernel of 310P: clear the
2424
- // content of dest data buffer when row is not aligned to 32 bytes
2425
- if ((src0->ne[0] % 16) != 0) {
2426
- size_t dst_len =
2427
- src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
2428
- ggml_type_size(
2429
- GGML_TYPE_F32); // out is also f32, even input is f16
2430
- ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
1635
+ aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
1636
+ ggml_cann_pool_alloc src_buffer_allocator(
1637
+ ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
1638
+ void* src_trans_buffer = src_buffer_allocator.get();
1639
+ size_t src_trans_nb[GGML_MAX_DIMS];
1640
+ src_trans_nb[0] = sizeof(float_t);
1641
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1642
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
2431
1643
  }
2432
- #endif
2433
- aclrtlaunch_ascendc_get_row_f16(
2434
- 24, ctx.stream(), src0->data, src1->data, dst->data,
2435
- ((ggml_tensor*)src0->extra)->ne,
2436
- ((ggml_tensor*)src0->extra)->nb,
2437
- ((ggml_tensor*)src1->extra)->ne,
2438
- ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2439
- ((ggml_tensor*)dst->extra)->nb);
1644
+ aclTensor* src_trans_tensor = ggml_cann_create_tensor(
1645
+ src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
1646
+ src0->ne, src_trans_nb, GGML_MAX_DIMS);
1647
+ aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
1648
+ aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
1649
+ src_trans_nb, src1, dst);
1650
+ ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
2440
1651
  break;
2441
1652
  }
2442
- case GGML_TYPE_Q4_0:
2443
- aclrtlaunch_ascendc_get_row_q4_0(
2444
- 24, ctx.stream(), src0->data, src1->data, dst->data,
2445
- ((ggml_tensor*)src0->extra)->ne,
2446
- ((ggml_tensor*)src1->extra)->ne,
2447
- ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2448
- ((ggml_tensor*)dst->extra)->nb);
2449
- break;
2450
- case GGML_TYPE_Q8_0:
2451
- aclrtlaunch_ascendc_get_row_q8_0(
2452
- 24, ctx.stream(), src0->data, src1->data, dst->data,
2453
- ((ggml_tensor*)src0->extra)->ne,
2454
- ((ggml_tensor*)src1->extra)->ne,
2455
- ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2456
- ((ggml_tensor*)dst->extra)->nb);
1653
+ case GGML_TYPE_Q8_0: {
1654
+ // add 1 dim for bcast mul.
1655
+ size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
1656
+ dequant_nb[GGML_MAX_DIMS + 1];
1657
+ int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
1658
+ *dequant_ne;
1659
+ int64_t scale_offset = 0;
1660
+
1661
+ // [3,4,5,64] -> [3,4,5,2,32]
1662
+ weight_ne[0] = QK8_0;
1663
+ weight_ne[1] = src0->ne[0] / QK8_0;
1664
+ weight_nb[0] = sizeof(int8_t);
1665
+ weight_nb[1] = weight_nb[0] * weight_ne[0];
1666
+ for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1667
+ weight_ne[i] = src0->ne[i - 1];
1668
+ weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
1669
+ }
1670
+
1671
+ // [3,4,5,64] -> [3,4,5,2,1]
1672
+ scale_ne[0] = 1;
1673
+ scale_ne[1] = src0->ne[0] / QK8_0;
1674
+ scale_nb[0] = sizeof(uint16_t);
1675
+ scale_nb[1] = scale_nb[0] * scale_ne[0];
1676
+ for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1677
+ scale_ne[i] = src0->ne[i - 1];
1678
+ scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
1679
+ }
1680
+
1681
+ // [3,4,5,64] -> [3,4,5,2,32]
1682
+ dequant_ne = weight_ne;
1683
+ dequant_nb[0] = sizeof(float_t);
1684
+ for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
1685
+ dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
1686
+ }
1687
+
1688
+ scale_offset = ggml_nelements(src0) * sizeof(int8_t);
1689
+ ggml_cann_pool_alloc dequant_buffer_allocator(
1690
+ ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
1691
+
1692
+ aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
1693
+ src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
1694
+ GGML_MAX_DIMS + 1);
1695
+ aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
1696
+ src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
1697
+ GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
1698
+ aclTensor* dequant_tensor = ggml_cann_create_tensor(
1699
+ dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
1700
+ dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
1701
+
1702
+ aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
1703
+ dequant_nb[0] = sizeof(float_t);
1704
+ dequant_ne = src0->ne;
1705
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1706
+ dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
1707
+ }
1708
+
1709
+ aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
1710
+ dequant_ne, dequant_nb, src1, dst);
1711
+
1712
+ ggml_cann_release_resources(ctx, dequant_tensor);
2457
1713
  break;
1714
+ }
2458
1715
  default:
2459
- GGML_ABORT("fatal error");
1716
+ GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
2460
1717
  break;
2461
1718
  }
2462
1719
  }
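In the Q8_0 branch above the raw buffer is viewed as int8 block weights plus one fp16 scale per 32-element block (all quants first, scales starting at scale_offset), dequantized with a broadcast multiply, and only then gathered with aclnn_embedding_4d. A scalar sketch of that dequantization step, with the scales already widened to float for clarity:

#include <cstdint>

// w[i] = q[i] * d[i / 32] -- the broadcast multiply performed on device above (sketch).
static void dequant_q8_0_sketch(const int8_t * quants, const float * block_scales,
                                int64_t n_elems, float * out) {
    const int64_t qk = 32; // QK8_0
    for (int64_t i = 0; i < n_elems; i++) {
        out[i] = (float) quants[i] * block_scales[i / qk];
    }
}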
@@ -2480,133 +1737,8 @@ static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
2480
1737
  aclTensor* acl_src, aclTensor* acl_dst,
2481
1738
  int64_t dim, int64_t repeats,
2482
1739
  int64_t output_size) {
2483
- uint64_t workspaceSize = 0;
2484
- aclOpExecutor* executor;
2485
- void* workspaceAddr = nullptr;
2486
-
2487
- ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize(
2488
- acl_src, repeats, dim, output_size, acl_dst, &workspaceSize,
2489
- &executor));
2490
- if (workspaceSize > 0) {
2491
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2492
- workspaceAddr = workspace_allocator.get();
2493
- }
2494
-
2495
- ACL_CHECK(aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize,
2496
- executor, ctx.stream()));
2497
- }
2498
-
2499
- /**
2500
- * @brief Performs matrix multiplication of two tensors.
2501
- *
2502
- * This function computes the matrix multiplication of the input tensor
2503
- * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
2504
- * destination tensor `acl_dst`.
2505
- * The operation is defined as:
2506
- * \f[
2507
- * \text {acl_dst}=\text {acl_input@acl_weight}
2508
- * \f]
2509
- *
2510
- * @param ctx The context for the CANN backend operations.
2511
- * @param acl_input The input tensor for the matrix multiplication.
2512
- * @param acl_weight The weight tensor for the matrix multiplication.
2513
- * @param acl_dst The destination tensor where the result of the matrix
2514
- * multiplication will be stored.
2515
- */
2516
- static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
2517
- aclTensor* acl_weight, aclTensor* acl_dst) {
2518
- int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is
2519
- // fp32, atlas a2 will transpose it to HFLOAT32.
2520
- uint64_t workspaceSize = 0;
2521
- aclOpExecutor* executor;
2522
- void* workspaceAddr = nullptr;
2523
-
2524
- ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
2525
- cube_math_type, &workspaceSize,
2526
- &executor));
2527
-
2528
- if (workspaceSize > 0) {
2529
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2530
- workspaceAddr = workspace_allocator.get();
2531
- }
2532
-
2533
- ACL_CHECK(
2534
- aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
2535
- }
2536
-
2537
- /**
2538
- * @brief Performs matrix multiplication of two 2D tensors.
2539
- *
2540
- * This function computes the matrix multiplication of the input tensor
2541
- * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
2542
- * destination tensor `acl_dst`.
2543
- * The operation is defined as:
2544
- * \f[
2545
- * \text {acl_dst}=\text {acl_input@acl_weight}
2546
- * \f]
2547
- *
2548
- * @param ctx The context for the CANN backend operations.
2549
- * @param acl_input The input tensor for the matrix multiplication.
2550
- * @param acl_weight The weight tensor for the matrix multiplication.
2551
- * @param acl_dst The destination tensor where the result of the matrix
2552
- * multiplication will be stored.
2553
- */
2554
- static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
2555
- aclTensor* acl_input, aclTensor* acl_weight,
2556
- aclTensor* acl_dst) {
2557
- int8_t cube_math_type = 2;
2558
- uint64_t workspaceSize = 0;
2559
- aclOpExecutor* executor;
2560
- void* workspaceAddr = nullptr;
2561
-
2562
- ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
2563
- cube_math_type, &workspaceSize,
2564
- &executor));
2565
-
2566
- if (workspaceSize > 0) {
2567
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2568
- workspaceAddr = workspace_allocator.get();
2569
- }
2570
-
2571
- ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
2572
- }
2573
-
2574
- /**
2575
- * @brief Performs matrix multiplication of two 3D tensors.
2576
- *
2577
- * This function computes the matrix multiplication of the input tensor
2578
- * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
2579
- * destination tensor `acl_dst`.
2580
- * The operation is defined as:
2581
- * \f[
2582
- * \text {acl_dst}=\text {acl_input@acl_weight}
2583
- * \f]
2584
- *
2585
- * @param ctx The context for the CANN backend operations.
2586
- * @param acl_input The input tensor for the matrix multiplication.
2587
- * @param acl_weight The weight tensor for the matrix multiplication.
2588
- * @param acl_dst The destination tensor where the result of the matrix
2589
- * multiplication will be stored.
2590
- */
2591
- static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
2592
- aclTensor* acl_input, aclTensor* acl_weight,
2593
- aclTensor* acl_dst) {
2594
- int8_t cube_math_type = 2;
2595
- uint64_t workspaceSize = 0;
2596
- aclOpExecutor* executor;
2597
- void* workspaceAddr = nullptr;
2598
-
2599
- ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
2600
- cube_math_type, &workspaceSize,
2601
- &executor));
2602
-
2603
- if (workspaceSize > 0) {
2604
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2605
- workspaceAddr = workspace_allocator.get();
2606
- }
2607
-
2608
- ACL_CHECK(
2609
- aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
1740
+ GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim,
1741
+ output_size, acl_dst);
2610
1742
  }
2611
1743
 
2612
1744
  /**
@@ -2654,19 +1786,19 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
2654
1786
 
2655
1787
  switch (n_dims) {
2656
1788
  case 2:
2657
- aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
1789
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mm, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
2658
1790
  break;
2659
1791
  case 3:
2660
- aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
1792
+ GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
2661
1793
  break;
2662
1794
  default:
2663
- aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
1795
+ // ALLOW_FP32_DOWN_PRECISION: when the input is
1796
+ // fp32, Atlas A2 will transpose it to HFLOAT32.
1797
+ GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_input_tensor, acl_weight_tensor, acl_dst, 1);
2664
1798
  break;
2665
1799
  }
2666
1800
 
2667
- ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
2668
- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2669
- ACL_CHECK(aclDestroyTensor(acl_dst));
1801
+ ggml_cann_release_resources(ctx, acl_weight_tensor, acl_input_tensor, acl_dst);
2670
1802
  }
2671
1803
 
2672
1804
  /**
@@ -2736,9 +1868,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2736
1868
  input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
2737
1869
  input_cast_nb, GGML_MAX_DIMS);
2738
1870
  aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
2739
-
2740
- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2741
- ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
1871
+ ggml_cann_release_resources(ctx, acl_input_tensor, acl_src1_tensor);
2742
1872
  }
2743
1873
 
2744
1874
  // output
@@ -2753,9 +1883,6 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2753
1883
  int64_t max_elem_size = 65535;
2754
1884
  int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
2755
1885
  ggml_cann_pool_alloc workspace_allocator(ctx.pool());
2756
- aclOpExecutor* executor = nullptr;
2757
- uint64_t workspaceSize = 0;
2758
- void* workspaceAddr = nullptr;
2759
1886
  for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
2760
1887
  for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
2761
1888
  int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
@@ -2794,20 +1921,11 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2794
1921
  if (src0->ne[0] > QK8_0) {
2795
1922
  antiquantGroupSize = QK8_0;
2796
1923
  }
2797
-
2798
- ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
2799
- acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
2800
- nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
2801
- &workspaceSize, &executor));
2802
- if (workspaceAddr == nullptr) {
2803
- workspaceAddr = workspace_allocator.alloc(workspaceSize);
2804
- }
2805
- ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
2806
- workspaceAddr, workspaceSize, executor, ctx.stream()));
2807
-
2808
- ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
2809
- ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
2810
- ACL_CHECK(aclDestroyTensor(acl_output_tensor));
1924
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
1925
+ acl_weight_tensor, acl_scale_tensor, nullptr,
1926
+ nullptr, nullptr, nullptr, antiquantGroupSize,
1927
+ acl_output_tensor);
1928
+ ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
2811
1929
 
2812
1930
  // other splits
2813
1931
  for (int64_t split = 1; split < split_size; split++) {
@@ -2834,20 +1952,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
  (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
  output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
  output_ne_offset);
-
- ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
- acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
- nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
- acl_output_tensor, &workspaceSize, &executor));
- ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
- workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
- ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
- ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
+ acl_weight_tensor, acl_scale_tensor, nullptr,
+ nullptr, nullptr, nullptr, antiquantGroupSize,
+ acl_output_tensor);
+ ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
  }

- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+ ggml_cann_release_resources(ctx, acl_input_tensor);
  }
  }

@@ -2864,11 +1976,9 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
  output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
  output_cast_nb, GGML_MAX_DIMS);
  aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
- aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
- ggml_cann_type_mapping(dst->type));
+ aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));

- ACL_CHECK(aclDestroyTensor(acl_output_tensor));
- ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
+ ggml_cann_release_resources(ctx, acl_output_tensor, acl_dst_tensor);
  }
  }

@@ -2884,7 +1994,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  ggml_cann_mul_mat_quant(ctx, dst, type);
  break;
  default:
- GGML_ABORT("fatal error");
+ GGML_ABORT("Unsupported type for mul_mat");
  break;
  }
  }
@@ -2909,22 +2019,8 @@ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
  aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
  aclIntArray* acl_dims = aclCreateIntArray(dims, 1);
-
- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst,
- &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(aclnnRoll(workspaceAddr, workspaceSize, executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyIntArray(acl_shifts));
- ACL_CHECK(aclDestroyIntArray(acl_dims));
+ GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts, acl_dims, acl_dst);
+ ggml_cann_release_resources(ctx, acl_shifts, acl_dims);
  }

  /**
@@ -2946,23 +2042,8 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
  float value) {
  aclIntArray* acl_index = aclCreateIntArray(index, index_num);
  aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
-
- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
- void* workspaceAddr = nullptr;
-
- ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize(
- acl_src, dim, acl_index, acl_value, &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
- }
-
- ACL_CHECK(aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize,
- executor, ctx.stream()));
-
- ACL_CHECK(aclDestroyIntArray(acl_index));
- ACL_CHECK(aclDestroyScalar(acl_value));
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index, acl_value);
+ ggml_cann_release_resources(ctx, acl_index, acl_value);
  }

  static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
@@ -2977,37 +2058,30 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
  ggml_tensor* src1 = dst->src[1]; // position
  ggml_tensor* src2 = dst->src[2]; // freq_factors

- // arange, [0,1,...,ne0/2]
- int64_t arange_length = src0->ne[0] / 2;
- ggml_cann_pool_alloc arange_allocator(ctx.pool(),
- arange_length * sizeof(float_t));
- void* arange_buffer = arange_allocator.get();
- int64_t arange_ne[] = {arange_length, 1, 1, 1};
- size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
- arange_length * sizeof(float_t)};
-
- aclTensor* acl_arange_tensor =
- ggml_cann_create_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t),
- arange_ne, arange_nb, GGML_MAX_DIMS);
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ // theta_scale arange, [0,1,...,ne00/2 - 1]
+ int64_t theta_scale_length = ne00 / 2;
+ ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
+ theta_scale_length * sizeof(float_t));
+ void* theta_scale_buffer = theta_scale_allocator.get();
+ int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
+ size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
+ theta_scale_length * sizeof(float_t)};
+
+ aclTensor* acl_theta_scale_tensor =
+ ggml_cann_create_tensor(theta_scale_buffer, ACL_FLOAT, sizeof(float_t),
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
  float start = 0;
  float step = 1;
- float stop = src0->ne[0] / 2;
- float n_elements = src0->ne[0] / 2;
- aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements);
+ float stop = ne00 / 2;
+ float n_elements = ne00 / 2;
+ aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements);

  // power
- // aclnnPowScalarTensor(): @param self is tensor which should be scalar, so
- // use aclnn_pow_tensor_tensor() until fixed. aclScalar* acl_theta_scale =
- // aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
- // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor,
- // acl_power_tensor);
- ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
- arange_length * sizeof(float_t));
- void* theta_scale_buffer = theta_scale_allocator.get();
- aclTensor* acl_theta_scale_tensor = aclnn_values(
- ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
- GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
- aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
+ aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
+ GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
+ acl_theta_scale_tensor);

  // freq_scale
  if (freq_scale != 1) {
@@ -3018,29 +2092,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
  if (src2) {
  aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
  src2->data, ggml_cann_type_mapping(src2->type),
- ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
- aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
- nullptr, true);
- ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
+ ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+ aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor);
+ ggml_cann_release_resources(ctx, acl_freq_factors_tensor);
  }

  // position
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
  int64_t position_length = src1->ne[0];
- int64_t position_ne[] = {1, position_length, 1, 1};
- size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t),
- sizeof(int32_t) * position_length,
+ int64_t position_ne[] = {1, 1, position_length, 1};
+ size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), sizeof(int32_t),
  sizeof(int32_t) * position_length};
  aclTensor* acl_position_tensor = ggml_cann_create_tensor(
  src1->data, ggml_cann_type_mapping(src1->type),
  ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);

  // power * position
- int64_t theta_length = arange_length * position_length;
+ int64_t theta_length = theta_scale_length * position_length;
  ggml_cann_pool_alloc theta_allocator(ctx.pool(),
  theta_length * sizeof(float_t));
  void* theta_buffer = theta_allocator.get();
- int64_t theta_ne[] = {arange_length, position_length, 1, 1};
+ int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1};
  size_t theta_nb[GGML_MAX_DIMS];
  theta_nb[0] = sizeof(float_t);
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
@@ -3052,40 +2124,22 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
  aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
  acl_theta_tensor);

- // permute: [0,1,2,3]->[0,2,1,3]
- int64_t permute_ne[] = {arange_length, 1, position_length, 1};
- size_t permute_nb[GGML_MAX_DIMS];
- permute_nb[0] = sizeof(float_t);
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
- permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1];
- }
- ggml_cann_pool_alloc permute_allocator(ctx.pool(),
- theta_length * sizeof(float_t));
- void* permute_buffer = permute_allocator.get();
- aclTensor* acl_permute_tensor = ggml_cann_create_tensor(
- permute_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
- GGML_MAX_DIMS, ACL_FORMAT_ND);
- int64_t permute_dim[] = {0, 2, 1, 3};
- int64_t num_dims = 4;
- aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim,
- num_dims);
-
  // sin/cos
  ggml_cann_pool_alloc sin_allocator(ctx.pool(),
  theta_length * sizeof(float_t));
  void* sin_buffer = sin_allocator.get();
  aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
- sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
+ sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
  GGML_MAX_DIMS, ACL_FORMAT_ND);
- aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor);
+ aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);

  ggml_cann_pool_alloc cos_allocator(ctx.pool(),
  theta_length * sizeof(float_t));
  void* cos_buffer = cos_allocator.get();
  aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
- cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
+ cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
  GGML_MAX_DIMS, ACL_FORMAT_ND);
- aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
+ aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);

  // attn_factor
  if (attn_factor != 1) {
@@ -3101,7 +2155,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
  } else {
  int64_t num_repeats = 2;
  int64_t dim = 3;
- int64_t output_size = arange_length * num_repeats;
+ int64_t output_size = theta_scale_length * num_repeats;
  aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
  num_repeats, output_size);
  aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
@@ -3109,13 +2163,8 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
  }

  // release
- ACL_CHECK(aclDestroyTensor(acl_arange_tensor));
- ACL_CHECK(aclDestroyTensor(acl_theta_scale_tensor));
- ACL_CHECK(aclDestroyTensor(acl_position_tensor));
- ACL_CHECK(aclDestroyTensor(acl_theta_tensor));
- ACL_CHECK(aclDestroyTensor(acl_permute_tensor));
- ACL_CHECK(aclDestroyTensor(acl_sin_tensor));
- ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
+ ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
+ acl_theta_tensor, acl_sin_tensor, acl_cos_tensor, acl_theta_scale);
  }

  #ifdef __cplusplus
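For orientation, the table that `aclnn_cache_init` now builds directly in the `{ne00/2, 1, n_pos, 1}` layout (dropping the old permute step) is the standard RoPE angle cache. Below is a CPU reference of the same arithmetic, written only as an illustrative sketch: the function name, the flat output layout, and the int32 position type are assumptions, and the NEOX repeat vs. interleave duplication of the second half is omitted.

```cpp
#include <cmath>
#include <cstdint>

// Illustrative CPU reference for the sin/cos cache assembled above.
static void rope_cache_reference(int64_t half_dim, int64_t n_pos,
                                 const int32_t* positions, float theta_scale,
                                 float freq_scale, float attn_factor,
                                 const float* freq_factors,  // nullptr when src2 is absent
                                 float* sin_out, float* cos_out) {
    for (int64_t p = 0; p < n_pos; ++p) {
        for (int64_t i = 0; i < half_dim; ++i) {
            // theta_scale^i, scaled by freq_scale and optionally divided by
            // the per-frequency factors (the src2 branch above)
            float freq = freq_scale * std::pow(theta_scale, (float) i);
            if (freq_factors != nullptr) {
                freq /= freq_factors[i];
            }
            const float theta = (float) positions[p] * freq;
            sin_out[p * half_dim + i] = std::sin(theta) * attn_factor;
            cos_out[p * half_dim + i] = std::cos(theta) * attn_factor;
        }
    }
}
```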
@@ -3137,7 +2186,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  // TODO: use ascendc
  // Only test with LLAMA model.
  ggml_tensor* src0 = dst->src[0]; // input
- ggml_tensor* src2 = dst->src[2]; // freq_factors

  // param
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
@@ -3172,13 +2220,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

  // init cos/sin cache
  ggml_cann_pool_alloc sin_allocator(
- ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
+ ctx.pool(), ne00 * ne02 * sizeof(float_t));
  ggml_cann_pool_alloc cos_allocator(
- ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
+ ctx.pool(), ne00 * ne02 * sizeof(float_t));
  void* sin_buffer = sin_allocator.get();
  void* cos_buffer = cos_allocator.get();

- int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
+ int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
  size_t sin_reshape_nb[GGML_MAX_DIMS];
  sin_reshape_nb[0] = sizeof(float_t);
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
@@ -3191,7 +2239,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
  sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
  aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
- theta_scale, freq_scale, attn_factor, is_neox);
+ theta_scale, freq_scale, attn_factor, is_neox);

  aclTensor* acl_src = ggml_cann_create_tensor(src0);
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
@@ -3228,8 +2276,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  int64_t shifts[] = {1};
  int64_t dims[] = {3};
  aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
- ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+ ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);

  // init [-1, 1, -1, 1, ...]
  minus_one_scale_buffer = minus_one_scale_allocator.get();
@@ -3265,8 +2312,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  int64_t dims[] = {3};
  aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);

- ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
- ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+ ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
  // init [-1, -1, -1, 1, 1,1,...]
  minus_one_scale_buffer = minus_one_scale_allocator.get();
  int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
@@ -3291,7 +2337,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  bool inplace = true;
  float scale = -1;
  aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
- ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
+ ggml_cann_release_resources(ctx, acl_first_half_tensor);
  }

  // TODO: n_dims < ne0
@@ -3319,8 +2365,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  // output
  void* output_fp32_buffer;
  if (src0->type == GGML_TYPE_F32) {
- aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
- aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
+ aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor);
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor,
  acl_sin_reshape_tensor);
  aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
  // TODO: ne0 != n_dims in mode2
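The roll plus alternating-sign scale in the fallback path above is the usual rotate-half formulation of RoPE: the output is `x * cos(theta) + rotate(x) * sin(theta)`. Below is a scalar sketch of the NEOX-style variant only as an illustration (the interleaved mode rolls by one element instead of half the head dimension; the function name and flat layout are assumptions).

```cpp
#include <cstdint>

// Illustrative rotate-half reference: sin_v/cos_v hold the per-index angles
// after the repeat step, so sin_v[i] == sin_v[i + half] in this layout.
static void rope_rotate_half_reference(const float* x, const float* sin_v,
                                       const float* cos_v, float* out,
                                       int64_t n_dims) {
    const int64_t half = n_dims / 2;
    for (int64_t i = 0; i < half; ++i) {
        // first half pairs with the negated second half, and vice versa
        out[i]        = x[i]        * cos_v[i]        - x[i + half] * sin_v[i];
        out[i + half] = x[i + half] * cos_v[i + half] + x[i]        * sin_v[i + half];
    }
}
```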
@@ -3356,76 +2402,188 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  output_fp32_tensor);
  aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);

- ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
- ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
- ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
- ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
- ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
- ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
- ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
- ACL_CHECK(aclDestroyTensor(acl_src));
+ ggml_cann_release_resources(ctx, input_fp32_tensor1, input_fp32_tensor2,
+ output_fp32_tensor, acl_sin_reshape_tensor,
+ acl_minus_one_tensor, acl_input_roll_mul_scale_tensor,
+ acl_input_roll_reshape_tensor, acl_src);
  }
  return;
  #endif

- // src0 == GGML_TYPE_F16
- // TODO: optimization this `if` code
- if (src0->type == GGML_TYPE_F16) {
- ggml_cann_pool_alloc sin_final_allocator(
- ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
- ggml_cann_pool_alloc cos_final_allocator(
- ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
- void* sin_final_buffer = sin_final_allocator.get();
- void* cos_final_buffer = cos_final_allocator.get();
-
- int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
- size_t sin_final_nb[GGML_MAX_DIMS];
- sin_final_nb[0] = ggml_type_size(src0->type);
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
- sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
+ // ggml_mode = 0 --> aclnn_mode = 1
+ int64_t acl_mode = mode == 0 ? 1 : mode;
+
+ switch (src0->type) {
+ case GGML_TYPE_F32: {
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src,
+ acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst);
+ break;
  }
- aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
- sin_final_buffer, ggml_cann_type_mapping(src0->type),
- ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
- GGML_MAX_DIMS);
- aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
- cos_final_buffer, ggml_cann_type_mapping(src0->type),
- ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
- GGML_MAX_DIMS);
+ case GGML_TYPE_F16: {
+ ggml_cann_pool_alloc src_trans_allocator(
+ ctx.pool(), ggml_nelements(src0) * sizeof(float));
+ void* src_trans_buffer = src_trans_allocator.get();
+ ggml_cann_pool_alloc dst_trans_allocator(
+ ctx.pool(), ggml_nelements(dst) * sizeof(float));
+ void* dst_trans_buffer = dst_trans_allocator.get();
+
+ size_t src_trans_nb[GGML_MAX_DIMS];
+ src_trans_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ }

- aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
- ggml_cann_type_mapping(src0->type));
- aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
- ggml_cann_type_mapping(src0->type));
- ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
- ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
- acl_sin_reshape_tensor = acl_sin_final_tensor;
- acl_cos_reshape_tensor = acl_cos_final_tensor;
- }
+ aclTensor* acl_src_trans_tensor = ggml_cann_create_tensor(
+ src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb,
+ GGML_MAX_DIMS);
+ aclTensor* acl_dst_trans_tensor = ggml_cann_create_tensor(
+ dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb,
+ GGML_MAX_DIMS);
+
+ aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT);

- uint64_t workspaceSize = 0;
- aclOpExecutor* executor;
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor,
+ acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+ acl_dst_trans_tensor);

- void* workspaceAddr = nullptr;
+ aclnn_cast(ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16);

- int acl_mode = mode;
- if (mode == 0) {
- acl_mode = 1;
+ ggml_cann_release_resources(ctx, acl_src_trans_tensor,
+ acl_dst_trans_tensor);
+ break;
+ }
+ default:
+ GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
+ break;
  }
+ ggml_cann_release_resources(ctx, acl_cos_reshape_tensor,
+ acl_sin_reshape_tensor, acl_src, acl_dst);
+ }
+
+
+ void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+ ggml_tensor * src0 = dst->src[0];
+
+ aclTensor* acl_src = ggml_cann_create_tensor(src0);
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src, 3, false, acl_dst);
+
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
+ }
+
+ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+
+ // stride
+ int64_t s0 = ((const int32_t*)(dst->op_params))[0];
+
+ aclTensor* acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
+ aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
+
+ int64_t strideVal[1];
+ strideVal[0] = s0;
+ aclIntArray *stride = aclCreateIntArray(strideVal, 1);
+ int64_t paddingVal[] = {0};
+ aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
+ int64_t dilationVal[] = {1};
+ aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
+ bool transposed = true;
+ int64_t groups = 1;
+ int8_t cubeMathType = 0;
+
+ #ifdef ASCEND_310P
+ cubeMathType = 1;
+ #endif
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride,
+ padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
+
+ ggml_cann_release_resources(ctx, acl_weight, acl_dst, stride, padding, dilation);
+ }
+
+ void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+ ggml_tensor * src0 = dst->src[0];
+
+ aclTensor* acl_input = ggml_cann_create_tensor(src0);
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+ float alphaValue = 1.0f;
+ aclScalar* alpha = nullptr;
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input, alpha, alpha, alpha,
+ acl_dst);
+
+ ggml_cann_release_resources(ctx, acl_input, acl_dst, alpha);
+ }
+
+ void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+ ggml_tensor * src0 = dst->src[0];
+
+ aclTensor* acl_src = ggml_cann_create_tensor(src0);
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+ int64_t reduceDimValue[] = {3};
+ aclIntArray* reduceDim = aclCreateIntArray(reduceDimValue, 1);
+ bool keepDim = true;
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src, reduceDim, keepDim, ACL_FLOAT, acl_dst);
+
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, reduceDim);
+ }
+
+ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+ ggml_tensor * src0 = dst->src[0];
+ int32_t *opts = (int32_t *) dst->op_params;
+ int64_t paddingsArray[2] = {opts[0], opts[1]};
+ aclIntArray* paddings = aclCreateIntArray(paddingsArray, 2);
+
+ for (int64_t i = 0; i < src0->ne[3]; i++) {
+ aclTensor* acl_src = ggml_cann_create_tensor(
+ (char*)src0->data + i * src0->ne[3],
+ ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
+ src0->ne, src0->nb, 3);

- ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
- acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
- acl_dst, &workspaceSize, &executor));
- if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
- workspaceAddr = workspace_allocator.get();
+ aclTensor* acl_dst = ggml_cann_create_tensor(
+ (char*)dst->data + i * src0->ne[3],
+ ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+ dst->ne, dst->nb, 3);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src, paddings, acl_dst);
+
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
  }
+ ggml_cann_release_resources(ctx, paddings);
+ }
+
+ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+
+ aclTensor* acl_self = ggml_cann_create_tensor(src0);
+ aclTensor* acl_other = ggml_cann_create_tensor(src1);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self, acl_other);
+
+ ggml_cann_sum(ctx, dst);
+
+ ggml_cann_release_resources(ctx, acl_self, acl_other);
+ }
+
+ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+ ggml_tensor * src0 = dst->src[0];
+
+ aclTensor* acl_src = ggml_cann_create_tensor(src0);
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+ float alphaValue = 0.0f;
+ aclScalar* alpha = nullptr;
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);

- ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
- executor, ctx.stream()));
+ GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src, alpha, acl_dst);

- ACL_CHECK(aclDestroyTensor(acl_src));
- ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
- ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
- ACL_CHECK(aclDestroyTensor(acl_dst));
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
  }
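Two of the newly added operators above are thin compositions of existing kernels: `ggml_cann_count_equal` lowers GGML_OP_COUNT_EQUAL to an in-place elementwise equality followed by the existing sum reduction, and `ggml_cann_step` lowers GGML_OP_STEP to a greater-than-zero comparison. A CPU sketch of the intended semantics, illustrative only (int32 inputs for count_equal and float for step are assumptions):

```cpp
#include <cstdint>

// count_equal: number of positions where the two tensors agree,
// i.e. the sum of an elementwise-equality mask.
static int64_t count_equal_reference(const int32_t* a, const int32_t* b, int64_t n) {
    int64_t count = 0;
    for (int64_t i = 0; i < n; ++i) {
        count += (a[i] == b[i]) ? 1 : 0;
    }
    return count;
}

// step: 1 where the input is strictly greater than zero, else 0,
// matching the GtScalar-with-threshold-0 lowering above.
static void step_reference(const float* x, float* y, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        y[i] = (x[i] > 0.0f) ? 1.0f : 0.0f;
    }
}
```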