whispercpp 1.2.0.2 → 1.3.1

Files changed (135)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
data/ext/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -0,0 +1,3427 @@
+ /*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+ #include "aclnn_ops.h"
+
+ #include <aclnnop/aclnn_addcdiv.h>
+ #include <aclnnop/aclnn_avgpool2d.h>
+ #include <aclnnop/aclnn_batch_matmul.h>
+ #include <aclnnop/aclnn_cast.h>
+ #include <aclnnop/aclnn_constant_pad_nd.h>
+ #include <aclnnop/aclnn_copy.h>
+ #include <aclnnop/aclnn_cos.h>
+ #include <aclnnop/aclnn_div.h>
+ #include <aclnnop/aclnn_exp.h>
+ #include <aclnnop/aclnn_fill_scalar.h>
+ #include <aclnnop/aclnn_group_norm.h>
+ #include <aclnnop/aclnn_index_fill_tensor.h>
+ #include <aclnnop/aclnn_layer_norm.h>
+ #include <aclnnop/aclnn_matmul.h>
+ #include <aclnnop/aclnn_max_pool.h>
+ #include <aclnnop/aclnn_mm.h>
+ #include <aclnnop/aclnn_permute.h>
+ #include <aclnnop/aclnn_pow_tensor_tensor.h>
+ #include <aclnnop/aclnn_reduce_sum.h>
+ #include <aclnnop/aclnn_repeat.h>
+ #include <aclnnop/aclnn_repeat_interleave.h>
+ #include <aclnnop/aclnn_roll.h>
+ #include <aclnnop/aclnn_sin.h>
+ #include <aclnnop/aclnn_softmax.h>
+ #include <aclnnop/aclnn_tril.h>
+ #include <aclnnop/aclnn_triu.h>
+ #include <aclnnop/aclnn_upsample_nearest_2d.h>
+ #include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
+ #include <float.h>
+
+ #include <cmath>
+ #include <cstring>
+ #include <exception>
+ #include <vector>
+
+ #include "ggml-impl.h"
+ #include "kernels/ascendc_kernels.h"
+
+ #define GGML_COMMON_DECL_C
+
+ #include "../ggml-common.h"
+
+ /**
+ * @brief Repeats elements of a tensor along each dimension according to the
+ * specified repeat array.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor to be repeated.
+ * @param acl_dst The destination tensor after repeating.
+ * @param repeat_array The array specifying the number of repetitions along each
+ * dimension.
+ */
+ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+ aclTensor* acl_dst, int64_t* repeat_array) {
+ // repeat tensor along each dim with repeat_array
+ aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
+
+ uint64_t workspaceSize = 0;
+ aclOpExecutor* executor;
+ void* workspaceAddr = nullptr;
+
+ ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst,
+ &workspaceSize, &executor));
+
+ if (workspaceSize > 0) {
+ // Memory obtained from the allocator is "freed" immediately and may be
+ // handed out to other requests, but it will not be accessed before this
+ // async task finishes, because all tasks in the same stream execute in
+ // order.
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+ workspaceAddr = workspace_allocator.get();
+ }
+ ACL_CHECK(
+ aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream()));
+ ACL_CHECK(aclDestroyIntArray(repeats));
+ }
+
+ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+ ggml_tensor* src = dst->src[0];
+ GGML_ASSERT(ggml_can_repeat(src, dst));
+
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+ int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
+ dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
+
+ aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
+ ACL_CHECK(aclDestroyTensor(acl_src));
+ ACL_CHECK(aclDestroyTensor(acl_dst));
+ }
+
+ /**
+ * @brief Adds two tensors element-wise and stores the result in a destination
+ * tensor.
+ *
+ * This function performs the operation:
+ * \f[
+ * dst = acl\_src0 + alpha \times acl\_src1
+ * \f]
+ * where alpha is a scalar value and defaults to 1.0f.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src0 The first source tensor.
+ * @param acl_src1 The second source tensor.
+ * @param acl_dst The destination tensor where the result will be stored.
+ */
132
+ static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
133
+ aclTensor* acl_src1, aclTensor* acl_dst) {
134
+ aclScalar* alpha = nullptr;
135
+ float alphaValue = 1.0f;
136
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
137
+
138
+ uint64_t workspaceSize = 0;
139
+ aclOpExecutor* executor;
140
+ void* workspaceAddr = nullptr;
141
+
142
+ ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
143
+ &workspaceSize, &executor));
144
+ if (workspaceSize > 0) {
145
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
146
+ workspaceAddr = workspace_allocator.get();
147
+ }
148
+
149
+ ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
150
+
151
+ ACL_CHECK(aclDestroyScalar(alpha));
152
+ }
153
+
154
+ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
155
+ ggml_tensor* src0 = dst->src[0];
156
+ ggml_tensor* src1 = dst->src[1];
157
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
158
+
159
+ aclTensor* acl_src0;
160
+ aclTensor* acl_src1;
161
+ aclTensor* acl_dst;
162
+
163
+ // Need bcast
164
+ if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
165
+ BCAST_SHAPE(src0, src1)
166
+ acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
167
+ acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
168
+ acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
169
+ } else {
170
+ acl_src0 = ggml_cann_create_tensor(src0);
171
+ acl_src1 = ggml_cann_create_tensor(src1);
172
+ acl_dst = ggml_cann_create_tensor(dst);
173
+ }
174
+
175
+ aclnn_add(ctx, acl_src0, acl_src1, acl_dst);
176
+
177
+ ACL_CHECK(aclDestroyTensor(acl_src0));
178
+ ACL_CHECK(aclDestroyTensor(acl_src1));
179
+ ACL_CHECK(aclDestroyTensor(acl_dst));
180
+ }
181
+
182
+ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
183
+ ggml_tensor* src = dst->src[0];
184
+
185
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
186
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
187
+
188
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
189
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
190
+
191
+ float negative_slope;
192
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
193
+ aclScalar* acl_negative_slope =
194
+ aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
195
+
196
+ uint64_t workspaceSize = 0;
197
+ aclOpExecutor* executor;
198
+ void* workspaceAddr = nullptr;
199
+
200
+ ACL_CHECK(aclnnLeakyReluGetWorkspaceSize(
201
+ acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor));
202
+ if (workspaceSize > 0) {
203
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
204
+ workspaceAddr = workspace_allocator.get();
205
+ }
206
+
207
+ ACL_CHECK(
208
+ aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream()));
209
+
210
+ ACL_CHECK(aclDestroyScalar(acl_negative_slope));
211
+ ACL_CHECK(aclDestroyTensor(acl_src));
212
+ ACL_CHECK(aclDestroyTensor(acl_dst));
213
+ }
214
+
+ /**
+ * @brief Concatenates a list of tensors along a specified dimension and stores
+ * the result in a destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param tensorList The list of tensors to be concatenated.
+ * @param acl_dst The destination tensor where the concatenated result will be
+ * stored.
+ * @param concat_dim The dimension along which the tensors will be concatenated.
+ */
225
+ static void aclnn_concat(ggml_backend_cann_context& ctx,
226
+ aclTensorList* tensorList, aclTensor* acl_dst,
227
+ int64_t concat_dim) {
228
+ uint64_t workspaceSize = 0;
229
+ aclOpExecutor* executor;
230
+ void* workspaceAddr = nullptr;
231
+
232
+ ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst,
233
+ &workspaceSize, &executor));
234
+ if (workspaceSize > 0) {
235
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
236
+ workspaceAddr = workspace_allocator.get();
237
+ }
238
+
239
+ ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, ctx.stream()));
240
+ }
241
+
242
+ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
243
+ ggml_tensor* src0 = dst->src[0];
244
+ ggml_tensor* src1 = dst->src[1];
245
+ aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
246
+ aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
247
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
248
+
249
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
250
+
251
+ GGML_ASSERT(dim >= 0 && dim < 4);
252
+ int32_t acl_dim = 3 - dim;
253
+
254
+ aclTensor* tensors[] = {acl_src0, acl_src1};
255
+ aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
256
+ aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
257
+
258
+ ACL_CHECK(aclDestroyTensorList(tensorList));
259
+ ACL_CHECK(aclDestroyTensor(acl_dst));
260
+ }
261
+
+ /**
+ * @brief Creates a tensor with values starting from `start`, incremented by
+ * `step`, and ending before `stop`.
+ *
+ * This function performs the operation:
+ * \f[
+ * \text{out}_{i+1} = \text{out}_i + \text{step}
+ * \f]
+ * the range is [start, stop).
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_dst The destination tensor where the values will be stored.
+ * @param start The starting value of the range.
+ * @param stop The ending value of the range (exclusive).
+ * @param step The step size between consecutive values.
+ * @param n_elements The number of elements in the destination tensor.
+ */
279
+ static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
280
+ float start, float stop, float step,
281
+ int64_t n_elements) {
282
+ int64_t steps = (int64_t)std::ceil((stop - start) / step);
283
+ GGML_ASSERT(n_elements == steps);
284
+
285
+ uint64_t workspaceSize = 0;
286
+ aclOpExecutor* executor;
287
+ void* workspaceAddr = nullptr;
288
+
289
+ aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
290
+ aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
291
+ aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
292
+
293
+ ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst,
294
+ &workspaceSize, &executor));
295
+ if (workspaceSize > 0) {
296
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
297
+ workspaceAddr = workspace_allocator.get();
298
+ }
299
+
300
+ ACL_CHECK(
301
+ aclnnArange(workspaceAddr, workspaceSize, executor, ctx.stream()));
302
+
303
+ ACL_CHECK(aclDestroyScalar(acl_start));
304
+ ACL_CHECK(aclDestroyScalar(acl_end));
305
+ ACL_CHECK(aclDestroyScalar(acl_step));
306
+ }
307
+
308
+ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
309
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
310
+
311
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
312
+
313
+ int64_t n_elements = ggml_nelements(dst);
314
+ float start;
315
+ float stop;
316
+ float step;
317
+ memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
318
+ memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
319
+ memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
320
+
321
+ aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
322
+ ACL_CHECK(aclDestroyTensor(acl_dst));
323
+ }
324
+
325
+ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
326
+ dst->src[1] = dst->src[0];
327
+ ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
328
+ }
329
+
330
+ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
331
+ ggml_tensor* src = dst->src[0];
332
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
333
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
334
+
335
+ float min;
336
+ float max;
337
+ memcpy(&min, dst->op_params, sizeof(float));
338
+ memcpy(&max, (float*)dst->op_params + 1, sizeof(float));
339
+
340
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
341
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
342
+
343
+ aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
344
+ aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
345
+
346
+ uint64_t workspaceSize = 0;
347
+ aclOpExecutor* executor;
348
+ void* workspaceAddr = nullptr;
349
+
350
+ ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst,
351
+ &workspaceSize, &executor));
352
+ if (workspaceSize > 0) {
353
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
354
+ workspaceAddr = workspace_allocator.get();
355
+ }
356
+
357
+ ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, ctx.stream()));
358
+
359
+ ACL_CHECK(aclDestroyScalar(acl_min));
360
+ ACL_CHECK(aclDestroyScalar(acl_max));
361
+ ACL_CHECK(aclDestroyTensor(acl_src));
362
+ ACL_CHECK(aclDestroyTensor(acl_dst));
363
+ }
364
+
365
+ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
366
+ ggml_tensor* src = dst->src[0];
367
+
368
+ // scale factor
369
+ float v;
370
+ memcpy(&v, dst->op_params, sizeof(float));
371
+
372
+ aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
373
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
374
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
375
+
376
+ uint64_t workspaceSize = 0;
377
+ aclOpExecutor* executor;
378
+ void* workspaceAddr = nullptr;
379
+
380
+ ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize,
381
+ &executor));
382
+ if (workspaceSize > 0) {
383
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
384
+ workspaceAddr = workspace_allocator.get();
385
+ }
386
+
387
+ ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
388
+
389
+ ACL_CHECK(aclDestroyScalar(scale));
390
+ ACL_CHECK(aclDestroyTensor(acl_src));
391
+ ACL_CHECK(aclDestroyTensor(acl_dst));
392
+ }
393
+
394
+ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
395
+ ggml_tensor* src = dst->src[0];
396
+ enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
397
+
398
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
399
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
400
+ ggml_cann_pool_alloc temp_buffer_allocator(
401
+ ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
402
+ void* buffer = temp_buffer_allocator.get();
403
+ aclTensor* tmp_tensor =
404
+ ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
405
+ dst->ne, dst->nb, GGML_MAX_DIMS);
406
+
407
+ uint64_t workspaceSize = 0;
408
+ aclOpExecutor* executor;
409
+ void* workspaceAddr = nullptr;
410
+
411
+ ACL_CHECK(aclnnArgsortGetWorkspaceSize(
412
+ acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor,
413
+ &workspaceSize, &executor));
414
+ if (workspaceSize > 0) {
415
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
416
+ workspaceAddr = workspace_allocator.get();
417
+ }
418
+
419
+ ACL_CHECK(
420
+ aclnnArgsort(workspaceAddr, workspaceSize, executor, ctx.stream()));
421
+
422
+ workspaceSize = 0;
423
+ ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor,
424
+ ggml_cann_type_mapping(dst->type),
425
+ acl_dst, &workspaceSize, &executor));
426
+ if (workspaceSize > 0) {
427
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
428
+ workspaceAddr = workspace_allocator.get();
429
+ }
430
+
431
+ ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
432
+
433
+ ACL_CHECK(aclDestroyTensor(acl_src));
434
+ ACL_CHECK(aclDestroyTensor(tmp_tensor));
435
+ ACL_CHECK(aclDestroyTensor(acl_dst));
436
+ }
437
+
438
+ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
439
+ ggml_tensor* src = dst->src[0];
440
+
441
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
442
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
443
+
444
+ float eps;
445
+ memcpy(&eps, dst->op_params, sizeof(float));
446
+
447
+ uint64_t workspaceSize = 0;
448
+ aclOpExecutor* executor;
449
+ void* workspaceAddr = nullptr;
450
+
451
+ std::vector<int64_t> normData = {dst->ne[0]};
452
+ aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
453
+ ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr,
454
+ eps, acl_dst, nullptr, nullptr,
455
+ &workspaceSize, &executor));
456
+
457
+ if (workspaceSize > 0) {
458
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
459
+ workspaceAddr = workspace_allocator.get();
460
+ }
461
+
462
+ ACL_CHECK(
463
+ aclnnLayerNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
464
+
465
+ ACL_CHECK(aclDestroyIntArray(norm));
466
+ ACL_CHECK(aclDestroyTensor(acl_src));
467
+ ACL_CHECK(aclDestroyTensor(acl_dst));
468
+ }
469
+
470
+ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
471
+ ggml_tensor* src = dst->src[0];
472
+
473
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
474
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
475
+
476
+ int n_groups = dst->op_params[0];
477
+
478
+ float eps;
479
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
480
+
481
+ uint64_t workspaceSize = 0;
482
+ aclOpExecutor* executor;
483
+ void* workspaceAddr = nullptr;
484
+
485
+ int64_t N = src->ne[3];
486
+ int64_t C = src->ne[2];
487
+ int64_t HxW = src->ne[1] * src->ne[0];
488
+
489
+ size_t type_size = ggml_type_size(src->type);
490
+ int64_t ne[] = {n_groups, N};
491
+ size_t nb[] = {type_size, type_size * n_groups};
492
+ size_t n_bytes = N * n_groups;
493
+
494
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
495
+ void* buffer = temp_buffer_allocator.get();
496
+ aclTensor* acl_mean_out = ggml_cann_create_tensor(
497
+ buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
498
+ aclTensor* acl_rstd_out = ggml_cann_create_tensor(
499
+ (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
500
+
501
+ ACL_CHECK(aclnnGroupNormGetWorkspaceSize(
502
+ acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst,
503
+ acl_mean_out, acl_rstd_out, &workspaceSize, &executor));
504
+
505
+ if (workspaceSize > 0) {
506
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
507
+ workspaceAddr = workspace_allocator.get();
508
+ }
509
+
510
+ ACL_CHECK(
511
+ aclnnGroupNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
512
+
513
+ ACL_CHECK(aclDestroyTensor(acl_src));
514
+ ACL_CHECK(aclDestroyTensor(acl_dst));
515
+ ACL_CHECK(aclDestroyTensor(acl_mean_out));
516
+ ACL_CHECK(aclDestroyTensor(acl_rstd_out));
517
+ }
518
+
519
+ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
520
+ ggml_tensor* src0 = dst->src[0];
521
+ ggml_tensor* src1 = dst->src[1];
522
+
523
+ size_t nb1 = ((int32_t*)dst->op_params)[0];
524
+ size_t nb2 = ((int32_t*)dst->op_params)[1];
525
+ size_t nb3 = ((int32_t*)dst->op_params)[2];
526
+ size_t offset = ((int32_t*)dst->op_params)[3];
527
+ bool inplace = (bool)((int32_t*)dst->op_params)[4];
528
+
529
+ size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3};
530
+
531
+ aclTensor* acl_dst = ggml_cann_create_tensor(
532
+ dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
533
+ aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
534
+
535
+ aclScalar* alpha = nullptr;
536
+ float alphaValue = 1.0f;
537
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
538
+
539
+ uint64_t workspaceSize = 0;
540
+ aclOpExecutor* executor;
541
+ void* workspaceAddr = nullptr;
542
+
543
+ if (!inplace) {
544
+ size_t cpy_size = ggml_nbytes(dst);
545
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size,
546
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
547
+ aclTensor* acl_src0 = ggml_cann_create_tensor(
548
+ src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
549
+ ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
550
+ &workspaceSize, &executor));
551
+ if (workspaceSize > 0) {
552
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
553
+ workspaceAddr = workspace_allocator.get();
554
+ }
555
+ ACL_CHECK(
556
+ aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
557
+ ACL_CHECK(aclDestroyTensor(acl_src0));
558
+ } else {
559
+ ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha,
560
+ &workspaceSize, &executor));
561
+ if (workspaceSize > 0) {
562
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
563
+ workspaceAddr = workspace_allocator.get();
564
+ }
565
+ ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor,
566
+ ctx.stream()));
567
+ }
568
+
569
+ ACL_CHECK(aclDestroyTensor(acl_src1));
570
+ ACL_CHECK(aclDestroyTensor(acl_dst));
571
+ }
572
+
573
+ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
574
+ ggml_tensor* src = dst->src[0];
575
+
576
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
577
+
578
+ GGML_ASSERT(dst->ne[0] == 1);
579
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
580
+
581
+ int64_t reduce_dims_host[] = {3};
582
+ aclIntArray* reduce_dims = aclCreateIntArray(reduce_dims_host, 1);
583
+
584
+ uint64_t workspaceSize = 0;
585
+ aclOpExecutor* executor;
586
+ void* workspaceAddr = nullptr;
587
+
588
+ ACL_CHECK(aclnnReduceSumGetWorkspaceSize(
589
+ acl_src, reduce_dims, true, ggml_cann_type_mapping(src->type), acl_dst,
590
+ &workspaceSize, &executor));
591
+ if (workspaceSize > 0) {
592
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
593
+ workspaceAddr = workspace_allocator.get();
594
+ }
595
+
596
+ ACL_CHECK(
597
+ aclnnReduceSum(workspaceAddr, workspaceSize, executor, ctx.stream()));
598
+
599
+ ACL_CHECK(aclDestroyTensor(acl_src));
600
+ ACL_CHECK(aclDestroyTensor(acl_dst));
601
+ }
602
+
603
+ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
604
+ ggml_tensor* dst) {
605
+ ggml_tensor* src = dst->src[0];
606
+ aclTensor* acl_src =
607
+ ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
608
+ aclTensor* acl_dst =
609
+ ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
610
+
611
+ std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
612
+ auto output_size_array = aclCreateIntArray(output_size.data(), 2);
613
+
614
+ uint64_t workspaceSize = 0;
615
+ aclOpExecutor* executor;
616
+ void* workspaceAddr = nullptr;
617
+
618
+ ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize(
619
+ acl_src, output_size_array, acl_dst, &workspaceSize, &executor));
620
+ if (workspaceSize > 0) {
621
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
622
+ workspaceAddr = workspace_allocator.get();
623
+ }
624
+
625
+ ACL_CHECK(aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor,
626
+ ctx.stream()));
627
+
628
+ ACL_CHECK(aclDestroyIntArray(output_size_array));
629
+ ACL_CHECK(aclDestroyTensor(acl_src));
630
+ ACL_CHECK(aclDestroyTensor(acl_dst));
631
+ }
632
+
+ /**
+ * @brief Pads a tensor with a specified value along each dimension.
+ *
+ * This function performs padding of the source tensor `acl_src` and stores the
+ * result in the destination tensor `acl_dst`. The padding values for each
+ * dimension are specified in the `paddings` array.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor to be padded.
+ * @param acl_dst The destination tensor where the padded result will be stored.
+ * @param paddings An array specifying the padding values for each dimension.
+ * The size of the array should be twice the number of dimensions of the tensor.
+ * @param value The value to be used for padding. The default value is 0.0.
+ */
647
+ static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
648
+ aclTensor* acl_dst, int64_t* paddings,
649
+ float value = 0.0f) {
650
+ aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
651
+ aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
652
+
653
+ uint64_t workspaceSize = 0;
654
+ aclOpExecutor* executor;
655
+ void* workspaceAddr = nullptr;
656
+
657
+ ACL_CHECK(aclnnConstantPadNdGetWorkspaceSize(
658
+ acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor));
659
+
660
+ if (workspaceSize > 0) {
661
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
662
+ workspaceAddr = workspace_allocator.get();
663
+ }
664
+
665
+ ACL_CHECK(aclnnConstantPadNd(workspaceAddr, workspaceSize, executor,
666
+ ctx.stream()));
667
+
668
+ ACL_CHECK(aclDestroyIntArray(acl_pad));
669
+ ACL_CHECK(aclDestroyScalar(acl_value));
670
+ }
671
+
672
+ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
673
+ ggml_tensor* src = dst->src[0];
674
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
675
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
676
+
+ // padding: each value in the array specifies how far to pad on that side,
+ // and the positions correspond to [dim0.front, dim0.behind, dim1.front,
+ // dim1.behind, dim2.front, dim2.behind, dim3.front, dim3.behind]
681
+ int64_t paddings[] = {
682
+ 0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
683
+ 0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
684
+ aclnn_pad(ctx, acl_src, acl_dst, paddings);
685
+
686
+ ACL_CHECK(aclDestroyTensor(acl_dst));
687
+ ACL_CHECK(aclDestroyTensor(acl_src));
688
+ }
689
+
+ /**
+ * @brief Performs 2D average pooling on the input tensor and stores the result
+ * in the destination tensor.
+ *
+ * This function performs average pooling on the source tensor and stores the
+ * result in the destination tensor. The pooling parameters (kernel size,
+ * strides, padding) are specified in the `op_params` of the destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result will be stored. The source
+ * tensor is referenced by `dst->src[0]`.
+ */
702
+ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
703
+ ggml_tensor* dst) {
704
+ ggml_tensor* src = dst->src[0];
705
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
706
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
707
+
708
+ aclTensor* acl_src =
709
+ ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
710
+ aclTensor* acl_dst =
711
+ ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
712
+
713
+ const int32_t* opts = (const int32_t*)dst->op_params;
714
+ const int k0 = opts[1];
715
+ const int k1 = opts[2];
716
+ const int s0 = opts[3];
717
+ const int s1 = opts[4];
718
+ const int p0 = opts[5];
719
+ const int p1 = opts[6];
720
+
721
+ std::vector<int64_t> kernel_dims = {k1, k0};
722
+ std::vector<int64_t> stride_dims = {s1, s0};
723
+ std::vector<int64_t> padding_avg_dims = {p1, p0}; // (padH, padW)
724
+
725
+ auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
726
+ auto* strides = aclCreateIntArray(stride_dims.data(), 2);
727
+ auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2);
728
+
729
+ bool ceil_mode = false;
730
+ bool count_include_pad = true;
731
+ int64_t divisor_override = 0;
732
+ int8_t cube_math_type = 0;
733
+
734
+ uint64_t workspaceSize = 0;
735
+ aclOpExecutor* executor;
736
+ void* workspaceAddr = nullptr;
737
+
738
+ ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize(
739
+ acl_src, kernel_size, strides, paddings_avg, ceil_mode,
740
+ count_include_pad, divisor_override, cube_math_type, acl_dst,
741
+ &workspaceSize, &executor));
742
+
743
+ if (workspaceSize > 0) {
744
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
745
+ workspaceAddr = workspace_allocator.get();
746
+ }
747
+ ACL_CHECK(
748
+ aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, ctx.stream()));
749
+
750
+ ACL_CHECK(aclDestroyTensor(acl_src));
751
+ ACL_CHECK(aclDestroyTensor(acl_dst));
752
+ ACL_CHECK(aclDestroyIntArray(kernel_size));
753
+ ACL_CHECK(aclDestroyIntArray(strides));
754
+ ACL_CHECK(aclDestroyIntArray(paddings_avg));
755
+ }
756
+
+ /**
+ * @brief Performs 2D max pooling on the input tensor and stores the result in
+ * the destination tensor.
+ *
+ * This function performs max pooling on the source tensor and stores the result
+ * in the destination tensor. The pooling parameters (kernel size, strides,
+ * padding) are specified in the `op_params` of the destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result will be stored. The source
+ * tensor is referenced by `dst->src[0]`.
+ */
769
+ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
770
+ ggml_tensor* dst) {
771
+ ggml_tensor* src = dst->src[0];
772
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
773
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
774
+
775
+ aclTensor* acl_src =
776
+ ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
777
+ aclTensor* acl_dst =
778
+ ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
779
+
780
+ const int32_t* opts = (const int32_t*)dst->op_params;
781
+ const int k0 = opts[1];
782
+ const int k1 = opts[2];
783
+ const int s0 = opts[3];
784
+ const int s1 = opts[4];
785
+ const int p0 = opts[5];
786
+ const int p1 = opts[6];
787
+
788
+ int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2],
789
+ src->ne[3]};
790
+ size_t temp_nb[GGML_MAX_DIMS];
791
+
792
+ temp_nb[0] = ggml_element_size(src);
793
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
794
+ temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
795
+ }
796
+
797
+ ggml_cann_pool_alloc temp_buffer_allocator(
798
+ ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
799
+ void* buffer = temp_buffer_allocator.get();
800
+ aclTensor* tmp_tensor = ggml_cann_create_tensor(
801
+ buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
802
+ GGML_MAX_DIMS, ACL_FORMAT_NCHW);
803
+
804
+ // pad: see padding in ggml_cann_pad()
805
+ int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0};
806
+ float value = -FLT_MAX;
807
+ aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value);
808
+
809
+ // max_pool
810
+ std::vector<int64_t> kernel_dims = {k1, k0};
811
+ std::vector<int64_t> stride_dims = {s1, s0};
812
+ // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
813
+ std::vector<int64_t> padding_max_dims = {0, 0, 0, 0};
814
+ std::vector<int64_t> dilation_size = {1, 1};
815
+ auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
816
+ auto* strides = aclCreateIntArray(stride_dims.data(), 2);
817
+ auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4);
818
+ auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
819
+
820
+ bool ceil_mode = false;
821
+ int64_t auto_pads = 0;
822
+
823
+ uint64_t workspaceSize = 0;
824
+ aclOpExecutor* executor;
825
+ void* workspaceAddr = nullptr;
826
+
827
+ ACL_CHECK(aclnnMaxPoolGetWorkspaceSize(
828
+ tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations,
829
+ ceil_mode, acl_dst, &workspaceSize, &executor));
830
+ if (workspaceSize > 0) {
831
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
832
+ workspaceAddr = workspace_allocator.get();
833
+ }
834
+
835
+ ACL_CHECK(
836
+ aclnnMaxPool(workspaceAddr, workspaceSize, executor, ctx.stream()));
837
+
838
+ ACL_CHECK(aclDestroyTensor(acl_src));
839
+ ACL_CHECK(aclDestroyTensor(acl_dst));
840
+ ACL_CHECK(aclDestroyTensor(tmp_tensor));
841
+ ACL_CHECK(aclDestroyIntArray(kernel_size));
842
+ ACL_CHECK(aclDestroyIntArray(strides));
843
+ ACL_CHECK(aclDestroyIntArray(paddings_max));
844
+ ACL_CHECK(aclDestroyIntArray(dilations));
845
+ }
846
+
847
+ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
848
+ const int32_t* opts = (const int32_t*)dst->op_params;
849
+ enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
850
+ switch (op) {
851
+ case GGML_OP_POOL_AVG:
852
+ ggml_cann_avg_pool2d(ctx, dst);
853
+ break;
854
+ case GGML_OP_POOL_MAX:
855
+ ggml_cann_max_pool2d(ctx, dst);
856
+ break;
857
+ case GGML_OP_POOL_COUNT:
858
+ GGML_ABORT("fatal error");
859
+ break;
860
+ }
861
+ }
862
+
+ /**
+ * @brief Copies data from the source tensor to the destination tensor.
+ *
+ * This function copies data from the source tensor `acl_src` to the destination
+ * tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor from which data will be copied.
+ * @param acl_dst The destination tensor where the data will be copied to.
+ */
873
+ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
874
+ aclTensor* acl_dst) {
875
+ uint64_t workspaceSize = 0;
876
+ aclOpExecutor* executor;
877
+ void* workspaceAddr = nullptr;
878
+
879
+ ACL_CHECK(aclnnInplaceCopyGetWorkspaceSize(acl_dst, acl_src, &workspaceSize,
880
+ &executor));
881
+
882
+ if (workspaceSize > 0) {
883
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
884
+ workspaceAddr = workspace_allocator.get();
885
+ }
886
+
887
+ ACL_CHECK(
888
+ aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, ctx.stream()));
889
+ }
890
+
891
+ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
892
+ ggml_tensor* src = dst->src[0];
893
+
894
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
895
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
896
+
897
+ ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
898
+ ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
899
+ src->extra = src_extra_allocator.get();
900
+ dst->extra = dst_extra_allocator.get();
901
+ ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
902
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
903
+ ctx.stream()));
904
+ ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
905
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
906
+ ctx.stream()));
907
+
908
+ if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
909
+ ggml_are_same_shape(src, dst)) {
910
+ cann_copy(ctx, acl_src, acl_dst);
911
+ ACL_CHECK(aclDestroyTensor(acl_src));
912
+ ACL_CHECK(aclDestroyTensor(acl_dst));
913
+ return;
914
+ }
915
+ // TODO: simplify
916
+ if (src->type == GGML_TYPE_F16) {
917
+ if (dst->type == GGML_TYPE_Q8_0) {
918
+ aclrtlaunch_ascendc_quantize_f16_q8_0(
919
+ 24, ctx.stream(), src->data, dst->data,
920
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
921
+ ((ggml_tensor*)dst->extra)->ne);
922
+ return;
923
+ }
924
+ if (dst->type == GGML_TYPE_Q4_0) {
925
+ aclrtlaunch_ascendc_quantize_f16_to_q4_0(
926
+ 24, ctx.stream(), src->data, dst->data,
927
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
928
+ ((ggml_tensor*)dst->extra)->ne);
929
+ return;
930
+ }
931
+ if (dst->type == GGML_TYPE_F16) {
932
+ if (ggml_are_same_shape(src, dst)) {
933
+ cann_copy(ctx, acl_src, acl_dst);
934
+ ACL_CHECK(aclDestroyTensor(acl_src));
935
+ ACL_CHECK(aclDestroyTensor(acl_dst));
936
+ return;
937
+ }
938
+ if (ggml_is_contiguous(dst)) {
939
+ const size_t src_type_size = ggml_type_size(src->type);
940
+ if (src->nb[0] == src_type_size) {
941
+ // src0 is contiguous on first dimension, copy by rows
942
+ int64_t rows_num = ggml_nrows(src);
943
+
944
+ aclrtlaunch_ascendc_dup_by_rows_fp16(
945
+ rows_num, ctx.stream(), src->data, dst->data,
946
+ ((ggml_tensor*)src->extra)->ne,
947
+ ((ggml_tensor*)src->extra)->nb,
948
+ ((ggml_tensor*)dst->extra)->ne,
949
+ ((ggml_tensor*)dst->extra)->nb);
950
+ return;
951
+ }
952
+ GGML_ABORT("fatal error");
953
+ }
954
+ GGML_ABORT("fatal error");
955
+ }
956
+ if (dst->type == GGML_TYPE_F32) {
957
+ if (ggml_are_same_shape(src, dst)) {
958
+ cann_copy(ctx, acl_src, acl_dst);
959
+ ACL_CHECK(aclDestroyTensor(acl_src));
960
+ ACL_CHECK(aclDestroyTensor(acl_dst));
961
+ return;
962
+ }
963
+ if (ggml_is_contiguous(dst)) {
964
+ const size_t src_type_size = ggml_type_size(src->type);
965
+ if (src->nb[0] == src_type_size) {
966
+ // src0 is contiguous on first dimension, copy by rows
967
+ int64_t rows_num = ggml_nrows(src);
968
+ aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
969
+ rows_num, ctx.stream(), src->data, dst->data,
970
+ ((ggml_tensor*)src->extra)->ne,
971
+ ((ggml_tensor*)src->extra)->nb,
972
+ ((ggml_tensor*)dst->extra)->ne,
973
+ ((ggml_tensor*)dst->extra)->nb);
974
+ return;
975
+ }
976
+ GGML_ABORT("fatal error");
977
+ }
978
+ GGML_ABORT("fatal error");
979
+ }
980
+ // TODO
981
+ GGML_ABORT("fatal error");
982
+ } else if (src->type == GGML_TYPE_F32) {
983
+ // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
984
+ // && nb0 == type_size)
985
+ if (dst->type == GGML_TYPE_Q8_0) {
986
+ aclrtlaunch_ascendc_quantize_f32_q8_0(
987
+ 24, ctx.stream(), src->data, dst->data,
988
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
989
+ ((ggml_tensor*)dst->extra)->ne);
990
+ return;
991
+ }
992
+ if (dst->type == GGML_TYPE_Q4_0) {
993
+ aclrtlaunch_ascendc_quantize_f32_to_q4_0(
994
+ 24, ctx.stream(), src->data, dst->data,
995
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
996
+ ((ggml_tensor*)dst->extra)->ne);
997
+ return;
998
+ }
999
+ if (dst->type == GGML_TYPE_F32) {
1000
+ if (ggml_are_same_shape(src, dst)) {
1001
+ cann_copy(ctx, acl_src, acl_dst);
1002
+ ACL_CHECK(aclDestroyTensor(acl_src));
1003
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1004
+ return;
1005
+ }
1006
+ if (ggml_is_contiguous(dst)) {
1007
+ const size_t src_type_size = ggml_type_size(src->type);
1008
+ if (src->nb[0] == src_type_size) {
1009
+ // src0 is contiguous on first dimension, copy by rows
1010
+ int64_t rows_num = ggml_nrows(src);
1011
+ aclrtlaunch_ascendc_dup_by_rows_fp32(
1012
+ rows_num, ctx.stream(), src->data, dst->data,
1013
+ ((ggml_tensor*)src->extra)->ne,
1014
+ ((ggml_tensor*)src->extra)->nb,
1015
+ ((ggml_tensor*)dst->extra)->ne,
1016
+ ((ggml_tensor*)dst->extra)->nb);
1017
+ return;
1018
+ }
1019
+ GGML_ABORT("fatal error");
1020
+ } else {
1021
+ // TODO: dst not contiguous
1022
+ GGML_ABORT("fatal error");
1023
+ }
1024
+ }
1025
+ if (dst->type == GGML_TYPE_F16) {
1026
+ if (ggml_are_same_shape(src, dst)) {
1027
+ cann_copy(ctx, acl_src, acl_dst);
1028
+ ACL_CHECK(aclDestroyTensor(acl_src));
1029
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1030
+ return;
1031
+ }
1032
+ if (ggml_is_contiguous(dst)) {
1033
+ const size_t src_type_size = ggml_type_size(src->type);
1034
+ if (src->nb[0] == src_type_size) {
1035
+ // src0 is contiguous on first dimension, copy by rows
1036
+ int64_t rows_num = ggml_nrows(src);
1037
+ aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
1038
+ rows_num, ctx.stream(), src->data, dst->data,
1039
+ ((ggml_tensor*)src->extra)->ne,
1040
+ ((ggml_tensor*)src->extra)->nb,
1041
+ ((ggml_tensor*)dst->extra)->ne,
1042
+ ((ggml_tensor*)dst->extra)->nb);
1043
+ return;
1044
+ }
1045
+ GGML_ABORT("fatal error");
1046
+ }
1047
+ }
1048
+ // TODO
1049
+ GGML_ABORT("fatal error");
1050
+ } else {
1051
+ if (ggml_are_same_shape(src, dst)) {
1052
+ cann_copy(ctx, acl_src, acl_dst);
1053
+ ACL_CHECK(aclDestroyTensor(acl_src));
1054
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1055
+ return;
1056
+ }
1057
+ GGML_ABORT("fatal error");
1058
+ }
1059
+ }
1060
+
1061
+ #ifdef __cplusplus
1062
+ extern "C" {
1063
+ #endif
1064
+ aclnnStatus aclnnRmsNormGetWorkspaceSize(const aclTensor* x,
1065
+ const aclTensor* gamma, double epsilon,
1066
+ const aclTensor* yOut,
1067
+ const aclTensor* rstdOout,
1068
+ uint64_t* workspaceSize,
1069
+ aclOpExecutor** executor);
1070
+ aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize,
1071
+ aclOpExecutor* executor, aclrtStream stream);
1072
+ #ifdef __cplusplus
1073
+ }
1074
+ #endif
1075
+
1076
+ /**
1077
+ * @brief Creates an ACL tensor initialized with zeros using a provided buffer.
1078
+ *
1079
+ * This function initializes a tensor with zeros using the specified buffer and
1080
+ * tensor parameters.
1081
+ *
1082
+ * @param ctx The context for the CANN backend operations.
1083
+ * @param buffer The buffer to be used for the tensor data.
1084
+ * @param n_bytes The size of the buffer in bytes.
1085
+ * @param ne An array specifying the extents (sizes) of each dimension of the
1086
+ * tensor.
1087
+ * @param dims The number of dimensions of the tensor.
1088
+ * @param type The data type of the tensor.
1089
+ * @param type_size The size of each element in the tensor data type.
1090
+ * @return An ACL tensor initialized with zeros.
1091
+ */
1092
+ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
1093
+ size_t n_bytes, int64_t* ne, int64_t dims,
1094
+ aclDataType type, size_t type_size) {
1095
+ size_t nb[GGML_MAX_DIMS];
1096
+ nb[0] = type_size;
1097
+ for (int i = 1; i < dims; i++) {
1098
+ nb[i] = nb[i - 1] * ne[i - 1];
1099
+ }
1100
+
1101
+ ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream()));
1102
+ aclTensor* zero =
1103
+ ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
1104
+ return zero;
1105
+ }
1106
+
+ /**
+ * @brief Creates an ACL tensor initialized with a given value using a provided buffer.
+ *
+ * This function initializes a tensor with the given value using the specified
+ * buffer and tensor parameters.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param buffer The buffer to be used for the tensor data.
+ * @param n_bytes The size of the buffer in bytes.
+ * @param ne An array specifying the extents (sizes) of each dimension of the
+ * tensor.
+ * @param dims The number of dimensions of the tensor.
+ * @param type The data type of the tensor.
+ * @param type_size The size of each element in the tensor data type.
+ * @param value The value to be used for initializing the tensor (default
+ * is 1.0).
+ * @return An ACL tensor initialized with the given value.
+ */
1125
+ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
1126
+ size_t n_bytes, int64_t* ne, int64_t dims,
1127
+ aclDataType type, size_t type_size,
1128
+ float value = 1.0f) {
1129
+ aclTensor* acl_tensor =
1130
+ aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
1131
+ float alpha_host = 1.0f;
1132
+ aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
1133
+ aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
1134
+
1135
+ uint64_t workspaceSize = 0;
1136
+ aclOpExecutor* executor;
1137
+ void* workspaceAddr = nullptr;
1138
+
1139
+ ACL_CHECK(aclnnInplaceAddsGetWorkspaceSize(acl_tensor, other, alpha,
1140
+ &workspaceSize, &executor));
1141
+
1142
+ if (workspaceSize > 0) {
1143
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1144
+ workspaceAddr = workspace_allocator.get();
1145
+ }
1146
+ ACL_CHECK(
1147
+ aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream()));
1148
+
1149
+ return acl_tensor;
1150
+ }
1151
+
1152
+ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1153
+ ggml_tensor* src = dst->src[0];
1154
+
1155
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
1156
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1157
+
1158
+ float eps;
1159
+ memcpy(&eps, dst->op_params, sizeof(float));
1160
+
1161
+ GGML_ASSERT(eps > 0.0f);
1162
+
1163
+ uint64_t workspaceSize = 0;
1164
+ aclOpExecutor* executor;
1165
+ void* workspaceAddr = nullptr;
1166
+
1167
+ size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
1168
+ ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
1169
+
1170
+ aclTensor* acl_gamma = aclnn_values(
1171
+ ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
1172
+ ggml_cann_type_mapping(src->type), ggml_element_size(src));
1173
+
1174
+ size_t zero_tensor_n_bytes =
1175
+ src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src);
1176
+ ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes);
1177
+ aclTensor* acl_rstd =
1178
+ aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
1179
+ src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
1180
+ ggml_element_size(src));
1181
+
1182
+ ACL_CHECK(aclnnRmsNormGetWorkspaceSize(
1183
+ acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor));
1184
+
1185
+ if (workspaceSize > 0) {
1186
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1187
+ workspaceAddr = workspace_allocator.get();
1188
+ }
1189
+
1190
+ ACL_CHECK(
1191
+ aclnnRmsNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
1192
+
1193
+ ACL_CHECK(aclDestroyTensor(acl_src));
1194
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1195
+ ACL_CHECK(aclDestroyTensor(acl_gamma));
1196
+ ACL_CHECK(aclDestroyTensor(acl_rstd));
1197
+ }
1198
+
1199
+ // TODO: performance is low.
1200
+ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
1201
+ float value) {
1202
+ ggml_tensor* src = dst->src[0];
1203
+
1204
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
1205
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1206
+
1207
+ const int n_past = ((int32_t*)dst->op_params)[0];
1208
+
1209
+ size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *
1210
+ src->ne[3] * ggml_element_size(src);
1211
+ ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
1212
+
1213
+ aclTensor* mask_tensor =
1214
+ aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
1215
+ src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
1216
+ ggml_element_size(src), value);
1217
+
1218
+ uint64_t workspaceSize = 0;
1219
+ aclOpExecutor* executor;
1220
+ void* workspaceAddr = nullptr;
1221
+
1222
+ ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1,
1223
+ &workspaceSize, &executor));
1224
+ if (workspaceSize > 0) {
1225
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1226
+ workspaceAddr = workspace_allocator.get();
1227
+ }
1228
+
1229
+ ACL_CHECK(
1230
+ aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream()));
1231
+
1232
+ ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst,
1233
+ &workspaceSize, &executor));
1234
+ if (workspaceSize > 0) {
1235
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1236
+ workspaceAddr = workspace_allocator.get();
1237
+ }
1238
+
1239
+ ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream()));
1240
+
1241
+ aclScalar* alpha = nullptr;
1242
+ float alphaValue = 1.0f;
1243
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
1244
+
1245
+ ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha,
1246
+ &workspaceSize, &executor));
1247
+ if (workspaceSize > 0) {
1248
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1249
+ workspaceAddr = workspace_allocator.get();
1250
+ }
1251
+ ACL_CHECK(
1252
+ aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
1253
+
1254
+ ACL_CHECK(aclDestroyScalar(alpha));
1255
+ ACL_CHECK(aclDestroyTensor(mask_tensor));
1256
+ ACL_CHECK(aclDestroyTensor(acl_src));
1257
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1258
+ }
1259
+
1260
+ /**
1261
+ * @brief Casts the data type of a source tensor to a destination tensor.
1262
+ *
1263
+ * This function casts the data type of the source tensor `acl_src` to the
1264
+ * specified data type `cast_data_type` and stores the result in the destination
1265
+ * tensor `acl_dst`.
1266
+ *
1267
+ * @param ctx The context for the CANN backend operations.
1268
+ * @param acl_src The source tensor whose data type will be casted.
1269
+ * @param acl_dst The destination tensor where the casted result will be stored.
1270
+ * @param cast_data_type The target data type to which the source tensor will be
1271
+ * casted.
1272
+ */
1273
+ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1274
+ aclTensor* acl_dst, aclDataType cast_data_type) {
1275
+ uint64_t workspaceSize = 0;
1276
+ aclOpExecutor* executor;
1277
+ void* workspaceAddr = nullptr;
1278
+
1279
+ ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst,
1280
+ &workspaceSize, &executor));
1281
+ if (workspaceSize > 0) {
1282
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1283
+ workspaceAddr = workspace_allocator.get();
1284
+ }
1285
+
1286
+ ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
1287
+ }
1288
+
1289
+ /**
1290
+ * @brief Permutes the dimensions of a tensor according to a specified order.
1291
+ *
1292
+ * This function permutes the dimensions of the source tensor `acl_src`
1293
+ * according to the order specified in the `new_dim` array and stores the result
1294
+ * in the destination tensor `acl_dst`.
1295
+ *
1296
+ * @param ctx The context for the CANN backend operations.
1297
+ * @param acl_src The source tensor whose dimensions will be permuted.
1298
+ * @param acl_dst The destination tensor where the permuted result will be
1299
+ * stored.
1300
+ * @param new_dim An array specifying the new order of dimensions for the
1301
+ * tensor.
1302
+ * @param dims The number of dimensions in the tensor.
1303
+ */
1304
+ static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1305
+ aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
1306
+ aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
1307
+
1308
+ uint64_t workspaceSize = 0;
1309
+ aclOpExecutor* executor;
1310
+ void* workspaceAddr = nullptr;
1311
+
1312
+ ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst,
1313
+ &workspaceSize, &executor));
1314
+ if (workspaceSize > 0) {
1315
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1316
+ workspaceAddr = workspace_allocator.get();
1317
+ }
1318
+
1319
+ ACL_CHECK(
1320
+ aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));
1321
+
1322
+ ACL_CHECK(aclDestroyIntArray(acl_dims));
1323
+ }
1324
+
1325
+ #ifdef __cplusplus
1326
+ extern "C" {
1327
+ #endif
1328
+ aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self,
1329
+ const aclIntArray* kernelSize,
1330
+ const aclIntArray* dilation,
1331
+ const aclIntArray* padding,
1332
+ const aclIntArray* stride,
1333
+ aclTensor* out, uint64_t* workspaceSize,
1334
+ aclOpExecutor** executor);
1335
+ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
1336
+ aclOpExecutor* executor, aclrtStream stream);
1337
+ #ifdef __cplusplus
1338
+ }
1339
+ #endif
1340
+
1341
+ static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
1342
+ ggml_tensor* dst,
1343
+ ggml_tensor* src1,
1344
+ aclTensor* tmp_cast_tensor,
1345
+ aclTensor* tmp_im2col_tensor) {
1346
+ // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
1347
+ int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
1348
+ size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
1349
+ aclTensor* acl_dst =
1350
+ ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
1351
+
1352
+ int64_t permute_dim[] = {0, 2, 1};
1353
+ if (src1->type != dst->type) {
1354
+ aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
1355
+ } else {
1356
+ aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
1357
+ }
1358
+
1359
+ // release
1360
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1361
+ }
1362
+
1363
+ static void ggml_cann_im2col_1d_post_process(
1364
+ ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
1365
+ aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
1366
+ const std::vector<int64_t>& im2col_op_params) {
1367
+ // get params
1368
+ const int64_t KH = im2col_op_params[0];
1369
+ const int64_t KW = im2col_op_params[1];
1370
+ const int64_t IW = im2col_op_params[2];
1371
+ const int64_t IC = im2col_op_params[3];
1372
+ const int64_t N = im2col_op_params[4];
1373
+ const int64_t OH = im2col_op_params[5];
1374
+ const int64_t OW = im2col_op_params[6];
1375
+ const int64_t s0 = im2col_op_params[7];
1376
+ const int64_t p0 = im2col_op_params[8];
1377
+ const int64_t d0 = im2col_op_params[9];
1378
+ const int64_t n_bytes_factor = im2col_op_params[10];
1379
+
1380
+ // Permute: [N, IC * KH * KW, OW * OH] ->
1381
+ // [N, OW * OH * n_bytes_factor, IC * KH * KW]
1382
+ aclTensor* tmp_permute_tensor = nullptr;
1383
+ ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
1384
+ tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1385
+ void* tmp_permute_buffer = tmp_permute_allocator.get();
1386
+
1387
+ int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
1388
+ size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
1389
+ tmp_permute_nb[0] = ggml_type_size(dst->type);
1390
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1391
+ tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1392
+ }
1393
+
1394
+ tmp_permute_tensor = ggml_cann_create_tensor(
1395
+ tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
1396
+ ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
1397
+ GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1398
+
1399
+ int64_t permute_dim[] = {0, 2, 1};
1400
+ if (src1->type != dst->type) {
1401
+ aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
1402
+ } else {
1403
+ aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
1404
+ 3);
1405
+ }
1406
+
1407
+ // number of times the kernel moves in W dimension
1408
+ const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
1409
+ size_t offset;
1410
+ void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
1411
+
1412
+ // memory copy with offset to restore 1D im2col from 2d
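+ // In the permuted buffer each output position holds IC * KH * KW values
+ // (all channels interleaved), while dst expects, per channel, n_step_w
+ // contiguous patches of KH * KW values. The IC > 1 branch therefore gathers
+ // channel c's KH * KW chunk from each of n_step_w consecutive rows; IC == 1
+ // reduces to a single contiguous copy. The leading `offset` skips rows that
+ // appear to come from the artificial p1 = 1 padding used to run the 1D case
+ // through the 2D aclnnIm2col.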
1413
+ if (IC > 1) {
1414
+ offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
1415
+ size_t size_cpy = KH * KW * ggml_type_size(dst->type);
1416
+
1417
+ for (int c = 0; c < IC; c++) {
1418
+ cur_permute_buffer = (char*)tmp_permute_buffer + offset +
1419
+ KH * KW * c * ggml_type_size(dst->type);
1420
+ cur_dst_buffer = (char*)dst->data +
1421
+ c * KH * KW * n_step_w * ggml_type_size(dst->type);
1422
+
1423
+ for (int i = 0; i < n_step_w; i++) {
1424
+ ACL_CHECK(aclrtMemcpyAsync(
1425
+ cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
1426
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1427
+ cur_dst_buffer =
1428
+ (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
1429
+ cur_permute_buffer = (char*)cur_permute_buffer +
1430
+ KH * KW * IC * ggml_type_size(dst->type);
1431
+ }
1432
+ }
1433
+ } else {
1434
+ offset = KH * KW * n_step_w *
1435
+ ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
1436
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
1437
+ (char*)tmp_permute_buffer + offset, offset,
1438
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1439
+ }
1440
+
1441
+ // release
1442
+ ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
1443
+ }
1444
+
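+ /**
+ * @brief CANN implementation of GGML_OP_IM2COL.
+ *
+ * src1 is unfolded by aclnnIm2col into a temporary [N, IC * KH * KW,
+ * OH * OW] buffer (the 1D case is padded and run as 2D), cast to dst->type
+ * when it differs from src1->type, and finally permuted / post-processed
+ * into dst as [N, OH * OW, IC * KH * KW].
+ */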
1445
+ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1446
+ ggml_tensor* src0 = dst->src[0]; // kernel
1447
+ ggml_tensor* src1 = dst->src[1]; // input
1448
+
1449
+ GGML_TENSOR_BINARY_OP_LOCALS;
1450
+
1451
+ // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
1452
+ // im2col and do post-processing to restore it to 1D.
1453
+ const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
1454
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
1455
+ const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
1456
+ const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
1457
+ const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
1458
+ const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
1459
+ const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
1460
+
1461
+ const int64_t N = ne13;
1462
+ const int64_t IC = ne12;
1463
+ const int64_t KH = ne01;
1464
+ const int64_t KW = ne00;
1465
+ const int64_t IW = ne10;
1466
+
1467
+ const int64_t OH = is_2D ? ne2 : 1;
1468
+ const int64_t OW = ne1;
1469
+
1470
+ // allocated memory is increased to 3x when is_2D == false
1471
+ const int64_t n_bytes_factor = is_2D ? 1 : 3;
1472
+
1473
+ // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
1474
+ aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
1475
+ int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
1476
+ size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
1477
+
1478
+ tmp_im2col_nb[0] = ggml_type_size(src1->type);
1479
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1480
+ tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
1481
+ }
1482
+
1483
+ // Calculate im2col.
+ // If dst is f16 the temporary buffer stays in src1's type (f32), so
+ // allocate src1's element size * dst's element count bytes.
1486
+ ggml_cann_pool_alloc im2col_allocator(
1487
+ ctx.pool(),
1488
+ ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
1489
+ void* tmp_im2col_buffer = im2col_allocator.get();
1490
+
1491
+ aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
1492
+ tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
1493
+ ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
1494
+ GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1495
+
1496
+ std::vector<int64_t> kernel_dims = {KH, KW};
1497
+ std::vector<int64_t> dilation_size = {d1, d0};
1498
+ std::vector<int64_t> padding_dims = {p1, p0};
1499
+ std::vector<int64_t> stride_dims = {s1, s0};
1500
+ auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
1501
+ auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
1502
+ auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
1503
+ auto* strides = aclCreateIntArray(stride_dims.data(), 2);
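+ // The parameter arrays are ordered {height, width}: s1/p1/d1 (H) first,
+ // then s0/p0/d0 (W), matching how they are passed to aclnnIm2col below.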
1504
+
1505
+ uint64_t workspaceSize = 0;
1506
+ aclOpExecutor* executor;
1507
+ void* workspaceAddr = nullptr;
1508
+
1509
+ ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations,
1510
+ paddings, strides, tmp_im2col_tensor,
1511
+ &workspaceSize, &executor));
1512
+
1513
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool());
1514
+ if (workspaceSize > 0) {
1515
+ workspace_allocator.alloc(workspaceSize);
1516
+ workspaceAddr = workspace_allocator.get();
1517
+ }
1518
+
1519
+ ACL_CHECK(
1520
+ aclnnIm2col(workspaceAddr, workspaceSize, executor, ctx.stream()));
1521
+
1522
+ // Cast if dst is f16.
1523
+ aclTensor* tmp_cast_tensor = nullptr;
1524
+ ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
1525
+ void* tmp_cast_buffer = nullptr;
1526
+ if (src1->type != dst->type) {
1527
+ tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1528
+ tmp_cast_buffer = tmp_cast_allocator.get();
1529
+ size_t temp_cast_nb[GGML_MAX_DIMS - 1];
1530
+ temp_cast_nb[0] = ggml_type_size(dst->type);
1531
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1532
+ temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
1533
+ }
1534
+
1535
+ tmp_cast_tensor = ggml_cann_create_tensor(
1536
+ tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
1537
+ ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
1538
+ GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1539
+ aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
1540
+ ggml_cann_type_mapping(dst->type));
1541
+ }
1542
+
1543
+ // post-processing
1544
+ if (is_2D) {
1545
+ ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
1546
+ tmp_im2col_tensor);
1547
+ } else {
1548
+ std::vector<int64_t> im2col_op_params = {
1549
+ KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
1550
+ ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
1551
+ tmp_im2col_tensor, im2col_op_params);
1552
+ }
1553
+
1554
+ // release
1555
+ ACL_CHECK(aclDestroyTensor(acl_src1));
1556
+ ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
1557
+ ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
1558
+ ACL_CHECK(aclDestroyIntArray(kernel_size));
1559
+ ACL_CHECK(aclDestroyIntArray(dilations));
1560
+ ACL_CHECK(aclDestroyIntArray(paddings));
1561
+ ACL_CHECK(aclDestroyIntArray(strides));
1562
+ }
1563
+
1564
+ /**
1565
+ * @brief Applies element-wise exponential function to the elements of a tensor.
1566
+ *
1567
+ * This function computes the exponential of each element in the source tensor
1568
+ * `acl_src` and stores the result back into the same tensor.
1569
+ * The operation is defined as:
1570
+ * \f[
1571
+ * \text {acl_src }_i=e^{acl\_src_i}
1572
+ * \f]
1573
+ *
1574
+ * @param ctx The context for the CANN backend operations.
1575
+ * @param acl_src The tensor on which the exponential function will be applied.
1576
+ */
1577
+ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
1578
+ uint64_t workspaceSize = 0;
1579
+ aclOpExecutor* executor;
1580
+ void* workspaceAddr = nullptr;
1581
+
1582
+ ACL_CHECK(
1583
+ aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor));
1584
+ if (workspaceSize > 0) {
1585
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1586
+ workspaceAddr = workspace_allocator.get();
1587
+ }
1588
+
1589
+ ACL_CHECK(
1590
+ aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream()));
1591
+ }
1592
+
1593
+ /**
1594
+ * @brief Multiplies elements of a tensor by a scalar value, optionally
1595
+ * in-place.
1596
+ *
1597
+ * This function multiplies each element of the source tensor `acl_src` by the
1598
+ * scalar `scale` and stores the result in the destination tensor `acl_dst`. If
1599
+ * `inplace` is true, `acl_dst` will not be used and the operation is performed
1600
+ * in-place on `acl_src`.
1601
+ * The operation is defined as:
1602
+ * \f[
1603
+ * \text {acl_dst }_i=\text {acl_src }_i \times \text {scale}
1604
+ * \f]
1605
+ *
1606
+ * @param ctx The context for the CANN backend operations.
1607
+ * @param acl_src The source tensor whose elements will be multiplied.
1608
+ * @param scale The scalar value by which each element of `acl_src` will be
1609
+ * multiplied.
1610
+ * @param acl_dst The destination tensor where the result will be stored if
1611
+ * `inplace` is false.
1612
+ * @param inplace Flag indicating whether to perform the operation in-place on
1613
+ * `acl_src`.
1614
+ */
1615
+ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1616
+ float scale, aclTensor* acl_dst, bool inplace) {
1617
+ aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
1618
+
1619
+ uint64_t workspaceSize = 0;
1620
+ aclOpExecutor* executor;
1621
+ void* workspaceAddr = nullptr;
1622
+
1623
+ if (inplace) {
1624
+ ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale,
1625
+ &workspaceSize, &executor));
1626
+ if (workspaceSize > 0) {
1627
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1628
+ workspaceAddr = workspace_allocator.get();
1629
+ }
1630
+
1631
+ ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor,
1632
+ ctx.stream()));
1633
+ } else {
1634
+ ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst,
1635
+ &workspaceSize, &executor));
1636
+ if (workspaceSize > 0) {
1637
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1638
+ workspaceAddr = workspace_allocator.get();
1639
+ }
1640
+
1641
+ ACL_CHECK(
1642
+ aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
1643
+ }
1644
+
1645
+ ACL_CHECK(aclDestroyScalar(acl_scale));
1646
+ }
1647
+
1648
+ /**
1649
+ * @brief Performs an in-place element-wise multiplication of two tensors.
1650
+ *
1651
+ * This function performs an element-wise multiplication of the tensors
1652
+ * `acl_src` and `acl_other` and stores the result in `acl_src`.
1653
+ * The operation is defined as:
1654
+ * \f[
1655
+ * \text {acl_src }_i=\text {acl_src }_i \times \text {acl_other }_i
1656
+ * \f]
1657
+ *
1658
+ * @param ctx The context for the CANN backend operations.
1659
+ * @param acl_src The source tensor where the multiplication result will be
1660
+ * stored.
1661
+ * @param acl_other The tensor whose elements will be multiplied with `acl_src`.
1662
+ */
1663
+ static void aclnn_inplace_mul(ggml_backend_cann_context& ctx,
1664
+ aclTensor* acl_src, aclTensor* acl_other) {
1665
+ uint64_t workspaceSize = 0;
1666
+ aclOpExecutor* executor;
1667
+ void* workspaceAddr = nullptr;
1668
+
1669
+ ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other,
1670
+ &workspaceSize, &executor));
1671
+ if (workspaceSize > 0) {
1672
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1673
+ workspaceAddr = workspace_allocator.get();
1674
+ }
1675
+
1676
+ ACL_CHECK(
1677
+ aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
1678
+ }
1679
+
1680
+ /**
1681
+ * @brief Performs element-wise multiplication of two tensors and stores the
1682
+ * result in a destination tensor.
1683
+ *
1684
+ * This function performs element-wise multiplication of the tensors `acl_src`
1685
+ * and `acl_other` and stores the result in the destination tensor `acl_dst`.
1686
+ * The operation is defined as:
1687
+ * \f[
1688
+ * \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i
1689
+ * \f]
1690
+ *
1691
+ * @param ctx The context for the CANN backend operations.
1692
+ * @param acl_src The first tensor for element-wise multiplication.
1693
+ * @param acl_other The second tensor for element-wise multiplication.
1694
+ * @param acl_dst The destination tensor where the result will be stored.
1695
+ */
1696
+ static void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1697
+ aclTensor* acl_other, aclTensor* acl_dst) {
1698
+ uint64_t workspaceSize = 0;
1699
+ aclOpExecutor* executor;
1700
+ void* workspaceAddr = nullptr;
1701
+
1702
+ ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst,
1703
+ &workspaceSize, &executor));
1704
+ if (workspaceSize > 0) {
1705
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1706
+ workspaceAddr = workspace_allocator.get();
1707
+ }
1708
+
1709
+ ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
1710
+ }
1711
+
1712
+ /**
1713
+ * @brief Applies element-wise cosine function to the elements of a tensor.
1714
+ *
1715
+ * This function computes the cosine of each element in the source tensor
1716
+ * `acl_src` and stores the result in the destination tensor `acl_dst`. The
1717
+ * operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src
1718
+ * }_i\right) \f]
1719
+ *
1720
+ * @param ctx The context for the CANN backend operations.
1721
+ * @param acl_src The source tensor on which the cosine function will be
1722
+ * applied.
1723
+ * @param acl_dst The destination tensor where the cosine results will be
1724
+ * stored.
1725
+ */
1726
+ static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1727
+ aclTensor* acl_dst) {
1728
+ uint64_t workspaceSize = 0;
1729
+ aclOpExecutor* executor;
1730
+ void* workspaceAddr = nullptr;
1731
+
1732
+ ACL_CHECK(
1733
+ aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
1734
+ if (workspaceSize > 0) {
1735
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1736
+ workspaceAddr = workspace_allocator.get();
1737
+ }
1738
+
1739
+ ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream()));
1740
+ }
1741
+
1742
+ /**
1743
+ * @brief Applies element-wise sine function to the elements of a tensor.
1744
+ *
1745
+ * This function computes the sine of each element in the source tensor
+ * `acl_src` and stores the result in the destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ * \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right)
+ * \f]
+ *
1753
+ * @param ctx The context for the CANN backend operations.
1754
+ * @param acl_src The source tensor on which the sine function will be applied.
1755
+ * @param acl_dst The destination tensor where the sine results will be stored.
1756
+ */
1757
+ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1758
+ aclTensor* acl_dst) {
1759
+ uint64_t workspaceSize = 0;
1760
+ aclOpExecutor* executor;
1761
+ void* workspaceAddr = nullptr;
1762
+
1763
+ ACL_CHECK(
1764
+ aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
1765
+ if (workspaceSize > 0) {
1766
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1767
+ workspaceAddr = workspace_allocator.get();
1768
+ }
1769
+
1770
+ ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
1771
+ }
1772
+
1773
+ /**
+ * @brief Performs element-wise division of tensor1 by tensor2, multiplies the
+ * result by the scalar value and adds it to self, in place.
+ *
+ * The operation is defined as:
+ * \f[
+ * \text{self}_i = \text{self}_i + \text{value} \times
+ * \frac{\text{tensor1}_i}{\text{tensor2}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_self The tensor to which the scaled quotient is added in place.
+ * @param tensor1 Numerator tensor.
+ * @param tensor2 Denominator tensor.
+ * @param value The scalar coefficient applied to the quotient.
+ */
1792
+ static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
1793
+ aclTensor* acl_self, aclTensor* tensor1,
1794
+ aclTensor* tensor2, float value) {
1795
+ uint64_t workspaceSize = 0;
1796
+ aclOpExecutor* executor;
1797
+ void* workspaceAddr = nullptr;
1798
+ aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
1799
+
1800
+ ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
1801
+ acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
1802
+ if (workspaceSize > 0) {
1803
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1804
+ workspaceAddr = workspace_allocator.get();
1805
+ }
1806
+
1807
+ ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
1808
+ ctx.stream()));
1809
+ }
1810
+
1811
+ /**
+ * @brief Element-wise tensor division, optionally in-place.
+ *
+ * This function divides each element of the source tensor `acl_src` by the
+ * corresponding element of `acl_other` and stores the result in the
+ * destination tensor `acl_dst`. If `inplace` is true, `acl_dst` is not used
+ * and the operation is performed in-place on `acl_src`.
+ * The operation is defined as:
+ * \f[
+ * \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src Numerator tensor.
+ * @param acl_other Denominator tensor.
+ * @param acl_dst The destination tensor where the result will be stored if
+ * `inplace` is false.
+ * @param inplace Flag indicating whether to perform the operation in-place on
+ * `acl_src`.
+ */
1829
+ static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1830
+ aclTensor* acl_other, aclTensor* acl_dst,
1831
+ bool inplace) {
1832
+ uint64_t workspaceSize = 0;
1833
+ aclOpExecutor* executor;
1834
+ void* workspaceAddr = nullptr;
1835
+
1836
+ if (inplace) {
1837
+ ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
1838
+ &workspaceSize, &executor));
1839
+ if (workspaceSize > 0) {
1840
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1841
+ workspaceAddr = workspace_allocator.get();
1842
+ }
1843
+
1844
+ ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
1845
+ ctx.stream()));
1846
+ } else {
1847
+ ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
1848
+ &workspaceSize, &executor));
1849
+ if (workspaceSize > 0) {
1850
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1851
+ workspaceAddr = workspace_allocator.get();
1852
+ }
1853
+
1854
+ ACL_CHECK(
1855
+ aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
1856
+ }
1857
+ }
1858
+
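+ /**
+ * @brief CANN implementation of GGML_OP_TIMESTEP_EMBEDDING.
+ *
+ * For each timestep t in src and each frequency index k in [0, half), the
+ * code below effectively computes
+ * \f[
+ * \text{freq}_k = e^{-k \cdot \ln(\text{max_period}) / \text{half}}
+ *               = \text{max_period}^{-k / \text{half}}
+ * \f]
+ * and writes cos(t * freq_k) into the first half of each output row and
+ * sin(t * freq_k) into the second half (the cos and sin results are
+ * concatenated along the innermost dimension).
+ */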
1859
+ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1860
+ ggml_tensor* dst) {
1861
+ const ggml_tensor* src = dst->src[0];
1862
+
1863
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
1864
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1865
+
1866
+ const int dim = dst->op_params[0];
1867
+ const int max_period = dst->op_params[1];
1868
+ int half = dim / 2;
1869
+
1870
+ aclTensor* acl_src = ggml_cann_create_tensor(src);
1871
+
1872
+ // arange: [0, ..., half)
1873
+ float start = 0;
1874
+ float stop = half;
1875
+ float step = 1;
1876
+ int64_t n_elements_arange = half;
1877
+ int64_t tmp_arange_ne[] = {half};
1878
+ size_t tmp_arange_nb[] = {sizeof(dst->type)};
1879
+
1880
+ ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type));
1881
+ void* tmp_arange_buffer = arange_allocator.get();
1882
+ aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
1883
+ tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
1884
+ ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb,
1885
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1886
+
1887
+ aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange);
1888
+
1889
+ // freq
1890
+ float freq_param = -logf(max_period) / half;
1891
+ bool inplace = true;
1892
+ aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace);
1893
+ aclnn_exp(ctx, tmp_arange_tensor);
1894
+
1895
+ // permute: src [0,1,2,3]->[0,1,3,2]
1896
+ int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
1897
+ size_t tmp_permute_nb[GGML_MAX_DIMS];
1898
+ tmp_permute_nb[0] = ggml_type_size(src->type);
1899
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1900
+ tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1901
+ }
1902
+
1903
+ ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
1904
+ void* tmp_permute_buffer = permute_allocator.get();
1905
+ aclTensor* tmp_permute_tenosr = ggml_cann_create_tensor(
1906
+ tmp_permute_buffer, ggml_cann_type_mapping(src->type),
1907
+ ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
1908
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
1909
+ int64_t permute_dim[] = {0, 1, 3, 2};
1910
+ int64_t num_dims = 4;
1911
+ aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims);
1912
+
1913
+ // timestep * freq
1914
+ int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
1915
+ src->ne[3]};
1916
+ size_t tmp_mul_nb[GGML_MAX_DIMS];
1917
+ tmp_mul_nb[0] = ggml_type_size(src->type);
1918
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1919
+ tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
1920
+ }
1921
+
1922
+ int mul_nelements =
1923
+ src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
1924
+
1925
+ ggml_cann_pool_alloc mul_allocator(
1926
+ ctx.pool(), mul_nelements * ggml_type_size(src->type));
1927
+ void* tmp_mul_buffer = mul_allocator.get();
1928
+ aclTensor* tmp_mul_tensor = ggml_cann_create_tensor(
1929
+ tmp_mul_buffer, ggml_cann_type_mapping(src->type),
1930
+ ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1931
+ ACL_FORMAT_ND);
1932
+ aclnn_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, tmp_mul_tensor);
1933
+
1934
+ // cos
1935
+ ggml_cann_pool_alloc cos_allocator(
1936
+ ctx.pool(), mul_nelements * ggml_type_size(src->type));
1937
+ void* tmp_cos_buffer = cos_allocator.get();
1938
+ aclTensor* tmp_cos_tensor = ggml_cann_create_tensor(
1939
+ tmp_cos_buffer, ggml_cann_type_mapping(dst->type),
1940
+ ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1941
+ ACL_FORMAT_ND);
1942
+
1943
+ aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor);
1944
+
1945
+ // sin
1946
+ ggml_cann_pool_alloc sin_allocator(
1947
+ ctx.pool(), mul_nelements * ggml_type_size(src->type));
1948
+ void* tmp_sin_buffer = sin_allocator.get();
1949
+ aclTensor* tmp_sin_tensor = ggml_cann_create_tensor(
1950
+ tmp_sin_buffer, ggml_cann_type_mapping(dst->type),
1951
+ ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1952
+ ACL_FORMAT_ND);
1953
+
1954
+ aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor);
1955
+
1956
+ // concat
1957
+ int64_t concat_dim = 3;
1958
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1959
+ aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
1960
+ aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
1961
+ aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
1962
+
1963
+ // release
1964
+ // Destroying both the tensorList and its elements causes a segmentation
+ // fault, so only the list is destroyed here.
1965
+ ACL_CHECK(aclDestroyTensorList(tensorList));
1966
+ ACL_CHECK(aclDestroyTensor(acl_src));
1967
+ ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
1968
+ ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr));
1969
+ ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
1970
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1971
+ }
1972
+
1973
+ /**
1974
+ * @brief Fills a tensor with a scalar value.
1975
+ *
1976
+ * This function fills the destination tensor `acl_dst` with the scalar value
1977
+ * `scalar`.
1978
+ *
1979
+ * @param ctx The context for the CANN backend operations.
1980
+ * @param scalar The scalar value used to fill the tensor.
1981
+ * @param acl_dst The destination tensor to be filled with the scalar value.
1982
+ */
1983
+ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
1984
+ aclTensor* acl_dst) {
1985
+ auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
1986
+
1987
+ uint64_t workspaceSize = 0;
1988
+ aclOpExecutor* executor;
1989
+ void* workspaceAddr = nullptr;
1990
+
1991
+ ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize(
1992
+ acl_dst, acl_scalar, &workspaceSize, &executor));
1993
+ if (workspaceSize > 0) {
1994
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1995
+ workspaceAddr = workspace_allocator.get();
1996
+ }
1997
+
1998
+ ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor,
1999
+ ctx.stream()));
2000
+ ACL_CHECK(aclDestroyScalar(acl_scalar));
2001
+ }
2002
+
2003
+ /**
2004
+ * @brief Raises each element of a tensor to the power of the corresponding
2005
+ * element in another tensor.
2006
+ *
2007
+ * This function computes the element-wise power of the destination tensor
2008
+ * `acl_dst` raised to the power of the exponent tensor `acl_exp`.
2009
+ * The operation is defined as:
2010
+ * \f[
2011
+ * \text {acl_dst }_i=acl\_dst_i^{\text {acl_exp }_i}
2012
+ * \f]
2013
+ *
2014
+ * @param ctx The context for the CANN backend operations.
2015
+ * @param acl_dst The destination tensor, which also serves as the base tensor.
2016
+ * @param acl_exp The exponent tensor, each element of which is used to raise
2017
+ * the corresponding element in the destination tensor.
2018
+ */
2019
+ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
2020
+ aclTensor* acl_dst, aclTensor* acl_exp) {
2021
+ uint64_t workspaceSize = 0;
2022
+ aclOpExecutor* executor;
2023
+ void* workspaceAddr = nullptr;
2024
+
2025
+ ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize(
2026
+ acl_dst, acl_exp, &workspaceSize, &executor));
2027
+ if (workspaceSize > 0) {
2028
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2029
+ workspaceAddr = workspace_allocator.get();
2030
+ }
2031
+
2032
+ ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize,
2033
+ executor, ctx.stream()));
2034
+ }
2035
+
2036
+ /**
2037
+ * @brief Applies the ALiBi (Attention with Linear Biases) mechanism to the
+ * attention scores.
+ *
+ * @details This function implements the ALiBi mechanism, which adds fixed,
+ * head-specific linear biases to the attention scores to encode relative
+ * position without the need for explicit positional embeddings.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param acl_src The source tensor representing the query or key.
+ * @param acl_position The position tensor containing relative positions.
+ * @param acl_dst The destination tensor where the result will be stored.
+ * @param n_head The number of attention heads.
+ * @param src_ne The dimensions of the source tensor.
+ * @param src_nb0 The byte size of the first dimension of the source tensor.
+ * @param max_bias The maximum bias value used in the ALiBi mechanism.
+ * @param dst The destination tensor object for additional metadata.
+ *
+ * The function performs the following steps:
+ * 1. Computes the floor power of two of the number of heads to determine the
+ *    base for the bias calculation.
+ * 2. Initializes arange sequences and fills them with the per-head bias bases.
+ * 3. Computes the bias (slope) tensor by raising the bases to the arange
+ *    exponents.
+ * 4. Reshapes the bias tensor to match the dimensions of the input tensors.
+ * 5. Multiplies the position tensor by the bias tensor.
+ * 6. Adds the result of the multiplication to the source tensor to produce the
+ *    final output.
+ */
2066
+ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2067
+ aclTensor* acl_position, aclTensor* acl_dst,
2068
+ const int n_head, int64_t* src_ne, const size_t src_nb0,
2069
+ float max_bias, ggml_tensor* dst) {
2070
+ const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
2071
+ GGML_ASSERT(src_nb0 == sizeof(float));
2072
+ GGML_ASSERT(n_head == src_ne[2]);
2073
+
2074
+ const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
2075
+
2076
+ float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
2077
+ float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
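+ // Per-head ALiBi slope built below from m0/m1 and the arange exponents:
+ //   slope(h) = m0^(h + 1)                             for h <  n_heads_log2_floor
+ //   slope(h) = m1^(2 * (h - n_heads_log2_floor) + 1)  for h >= n_heads_log2_floor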
2078
+
2079
+ // init arange
2080
+ ggml_cann_pool_alloc arange_allocator(ctx.pool(),
2081
+ ne2_ne3 * ggml_type_size(dst->type));
2082
+ void* tmp_arange_buffer = arange_allocator.get();
2083
+
2084
+ // arange1: [1, ..., n_heads_log2_floor+1)
2085
+ float start = 1;
2086
+ float stop = n_heads_log2_floor + 1;
2087
+ float step = 1;
2088
+ int64_t n_elements_arange = n_heads_log2_floor;
2089
+
2090
+ int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
2091
+ size_t tmp_arange1_nb[] = {sizeof(dst->type)};
2092
+ aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
2093
+ tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
2094
+ ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
2095
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
2096
+
2097
+ aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
2098
+
2099
+ aclTensor* tmp_arange2_tensor = nullptr;
2100
+ if (n_heads_log2_floor < ne2_ne3) {
2101
+ // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
2102
+ start = 1;
2103
+ stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
2104
+ step = 2;
2105
+ n_elements_arange = ne2_ne3 - n_heads_log2_floor;
2106
+ int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
2107
+ size_t tmp_arange2_nb[] = {sizeof(dst->type)};
2108
+
2109
+ tmp_arange2_tensor = ggml_cann_create_tensor(
2110
+ (char*)tmp_arange_buffer +
2111
+ n_heads_log2_floor * ggml_type_size(dst->type),
2112
+ ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
2113
+ tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
2114
+ aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
2115
+ n_elements_arange);
2116
+ }
2117
+
2118
+ // init mk_base
2119
+ ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
2120
+ ne2_ne3 * ggml_type_size(dst->type));
2121
+ void* tmp_mk_base_buffer = mk_base_allocator.get();
2122
+ int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
2123
+ size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
2124
+ aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
2125
+ tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
2126
+ ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
2127
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
2128
+
2129
+ aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
2130
+
2131
+ aclTensor* tmp_mk_base2_tensor = nullptr;
2132
+ if (n_heads_log2_floor < ne2_ne3) {
2133
+ int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
2134
+ size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
2135
+ tmp_mk_base2_tensor = ggml_cann_create_tensor(
2136
+ (char*)tmp_mk_base_buffer +
2137
+ n_heads_log2_floor * ggml_type_size(dst->type),
2138
+ ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
2139
+ tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
2140
+ aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
2141
+ }
2142
+
2143
+ // init mk
2144
+ int64_t tmp_mk_base_ne[] = {ne2_ne3};
2145
+ size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
2146
+ aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
2147
+ tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
2148
+ ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
2149
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
2150
+ aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
2151
+ tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
2152
+ ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
2153
+ GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
2154
+ aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
2155
+
2156
+ // reshape mk
2157
+ int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
2158
+ size_t tmp_mk_nb[GGML_MAX_DIMS];
2159
+ tmp_mk_nb[0] = ggml_type_size(dst->type);
2160
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2161
+ tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
2162
+ }
2163
+ aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
2164
+ tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
2165
+ ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
2166
+ ACL_FORMAT_ND);
2167
+
2168
+ // acl_position * mk
2169
+ int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
2170
+ size_t tmp_output_nb[GGML_MAX_DIMS];
2171
+ tmp_output_nb[0] = ggml_type_size(dst->type);
2172
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2173
+ tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
2174
+ }
2175
+ ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
2176
+ void* tmp_output_buffer = output_allocator.get();
2177
+ aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
2178
+ tmp_output_buffer, ggml_cann_type_mapping(dst->type),
2179
+ ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
2180
+ ACL_FORMAT_ND);
2181
+ aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);
2182
+
2183
+ // add
2184
+ aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
2185
+
2186
+ ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor));
2187
+ ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor));
2188
+ ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor));
2189
+ ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor));
2190
+ ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor));
2191
+ ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
2192
+ ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
2193
+ ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
2194
+ }
2195
+
2196
+ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2197
+ ggml_cann_dup(ctx, dst);
2198
+ }
2199
+
2200
+ /**
2201
+ * @brief Performs element-wise addition of two tensors in place.
2202
+ *
2203
+ * This function adds the source tensor `acl_src` to the destination tensor
2204
+ * `acl_dst` element-wise and stores the result in the destination tensor
2205
+ * `acl_dst`.
2206
+ *
2207
+ * @param ctx The context for the CANN backend operations.
2208
+ * @param acl_src The source tensor to be added.
2209
+ * @param acl_dst The destination tensor which will hold the result of the
2210
+ * addition.
2211
+ */
2212
+ static void aclnn_inplace_add(ggml_backend_cann_context& ctx,
2213
+ aclTensor* acl_src, aclTensor* acl_dst) {
2214
+ aclScalar* alpha = nullptr;
2215
+ float alphaValue = 1.0f;
2216
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2217
+
2218
+ uint64_t workspaceSize = 0;
2219
+ aclOpExecutor* executor;
2220
+ void* workspaceAddr = nullptr;
2221
+
2222
+ ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha,
2223
+ &workspaceSize, &executor));
2224
+ if (workspaceSize > 0) {
2225
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2226
+ workspaceAddr = workspace_allocator.get();
2227
+ }
2228
+
2229
+ ACL_CHECK(
2230
+ aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
2231
+
2232
+ ACL_CHECK(aclDestroyScalar(alpha));
2233
+ }
2234
+
2235
+ /**
2236
+ * @brief Applies the softmax function to a tensor along a specified dimension.
2237
+ *
2238
+ * This function computes the softmax of the source tensor `acl_src` along the
2239
+ * specified dimension `dim` and stores the result in the destination tensor
2240
+ * `acl_dst`.
2241
+ *
2242
+ * @param ctx The context for the CANN backend operations.
2243
+ * @param acl_src The source tensor on which the softmax function will be
2244
+ * applied.
2245
+ * @param dim The dimension along which the softmax function will be computed.
2246
+ * @param acl_dst The destination tensor where the softmax results will be
2247
+ * stored.
2248
+ */
2249
+ static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2250
+ int64_t dim, aclTensor* acl_dst) {
2251
+ uint64_t workspaceSize = 0;
2252
+ aclOpExecutor* executor;
2253
+ void* workspaceAddr = nullptr;
2254
+
2255
+ ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(acl_src, dim, acl_dst,
2256
+ &workspaceSize, &executor));
2257
+
2258
+ if (workspaceSize > 0) {
2259
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2260
+ workspaceAddr = workspace_allocator.get();
2261
+ }
2262
+
2263
+ aclrtStream stream = ctx.stream();
2264
+ ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream));
2265
+ }
2266
+
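+ /**
+ * @brief CANN implementation of GGML_OP_SOFT_MAX.
+ *
+ * Computes dst = softmax(src0 * scale + mask) along the innermost dimension.
+ * The optional mask (src1) is cast to fp32 when it is f16 and, if
+ * max_bias > 0, it is scaled by the per-head ALiBi slopes via aclnn_alibi()
+ * before being added to the scaled input.
+ */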
2267
+ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2268
+ ggml_tensor* src0 = dst->src[0];
2269
+ ggml_tensor* src1 = dst->src[1]; // mask
2270
+
2271
+ aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
2272
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2273
+
2274
+ float scale = 1.0f;
2275
+ float max_bias = 0.0f;
2276
+
2277
+ memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
2278
+ memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
2279
+
2280
+ // input mul scale
2281
+ aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
2282
+
2283
+ size_t n_bytes = ggml_nbytes(src0);
2284
+ ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
2285
+ void* input_mul_scale_buffer = mul_scale_allocator.get();
2286
+ aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
2287
+ input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
2288
+ src0->nb, GGML_MAX_DIMS);
2289
+
2290
+ bool inplace = false;
2291
+ aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);
2292
+
2293
+ // mask
2294
+ aclTensor* acl_src1_fp32_tensor = nullptr;
2295
+ aclTensor* tmp_mask_tensor = nullptr;
2296
+ ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
2297
+ if (src1) {
2298
+ const bool use_f16 = src1->type == GGML_TYPE_F16;
2299
+ if (use_f16) {
2300
+ // cast to fp32
2301
+ size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
2302
+ size_t src1_fp32_nb[GGML_MAX_DIMS];
2303
+ src1_fp32_nb[0] = sizeof(float_t);
2304
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2305
+ src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
2306
+ }
2307
+ src1_fp32_allocator.alloc(n_bytes);
2308
+ void* src1_fp32_buffer = src1_fp32_allocator.get();
2309
+ acl_src1_fp32_tensor = ggml_cann_create_tensor(
2310
+ src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
2311
+ src1_fp32_nb, GGML_MAX_DIMS);
2312
+ aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
2313
+ aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
2314
+
2315
+ ACL_CHECK(aclDestroyTensor(acl_src1));
2316
+ } else {
2317
+ acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
2318
+ }
2319
+
2320
+ // broadcast the mask across rows, only use ne11 of ne01 in mask
2321
+ if (src1->ne[1] != src0->ne[1]) {
2322
+ // mask shape: [1,1,ne11,ne10]
2323
+ int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
2324
+ size_t tmp_mask_nb[GGML_MAX_DIMS];
2325
+ tmp_mask_nb[0] = sizeof(float_t);
2326
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2327
+ tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
2328
+ }
2329
+ tmp_mask_tensor = ggml_cann_create_tensor(
2330
+ src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
2331
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
2332
+ }
2333
+
2334
+ // alibi
2335
+ const int n_head = src0->ne[2];
2336
+ const size_t src_nb0 = src0->nb[0];
2337
+
2338
+ n_bytes = ggml_nbytes(dst);
2339
+ ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
2340
+ void* output_buffer = output_allocator.get();
2341
+ aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
2342
+ output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
2343
+ dst->nb, GGML_MAX_DIMS);
2344
+ if (max_bias <= 0.0f) {
2345
+ // slope = 1.0
2346
+ if (tmp_mask_tensor) {
2347
+ aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
2348
+ alibi_output_tensor);
2349
+ } else {
2350
+ aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
2351
+ alibi_output_tensor);
2352
+ }
2353
+ } else {
2354
+ // slope != 1.0
2355
+ if (tmp_mask_tensor) {
2356
+ aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
2357
+ alibi_output_tensor, n_head, src0->ne, src_nb0,
2358
+ max_bias, dst);
2359
+ } else {
2360
+ aclnn_alibi(ctx, acl_input_mul_scale_tensor,
2361
+ acl_src1_fp32_tensor, alibi_output_tensor, n_head,
2362
+ src0->ne, src_nb0, max_bias, dst);
2363
+ }
2364
+ }
2365
+
2366
+ // softmax
2367
+ aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
2368
+ ACL_CHECK(aclDestroyTensor(alibi_output_tensor));
2369
+ } else {
2370
+ aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
2371
+ }
2372
+
2373
+ ACL_CHECK(aclDestroyTensor(acl_src0));
2374
+ ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor));
2375
+ ACL_CHECK(aclDestroyTensor(acl_dst));
2376
+ ACL_CHECK(aclDestroyScalar(acl_scale));
2377
+ ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor));
2378
+ ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
2379
+ }
2380
+
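+ /**
+ * @brief CANN implementation of GGML_OP_GET_ROWS.
+ *
+ * Copies the ggml_tensor descriptors of src0, src1 and dst into device
+ * buffers through the tensors' `extra` pointers, then launches the
+ * type-specific AscendC get_row kernel (f32, f16, q4_0 or q8_0); other
+ * source types abort.
+ */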
2381
+ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2382
+ ggml_tensor* src0 = dst->src[0];
2383
+ ggml_tensor* src1 = dst->src[1];
2384
+
2385
+ ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
2386
+ ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
2387
+ ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
2388
+ src0->extra = src0_extra_allocator.get();
2389
+ src1->extra = src1_extra_allocator.get();
2390
+ dst->extra = dst_extra_allocator.get();
2391
+ ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
2392
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
2393
+ ctx.stream()));
2394
+ ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
2395
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
2396
+ ctx.stream()));
2397
+ ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
2398
+ sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
2399
+ ctx.stream()));
2400
+
2401
+ switch (src0->type) {
2402
+ case GGML_TYPE_F32: {
2403
+ #ifdef ASCEND_310P
2404
+ // Special operation for get_row_f32 kernel of 310P: clear the
2405
+ // content of dest data buffer when row is not aligned to 32 bytes
2406
+ if ((src0->ne[0] % 8) != 0) {
2407
+ size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
2408
+ src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
2409
+ ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
2410
+ }
2411
+ #endif
2412
+ aclrtlaunch_ascendc_get_row_f32(
2413
+ 24, ctx.stream(), src0->data, src1->data, dst->data,
2414
+ ((ggml_tensor*)src0->extra)->ne,
2415
+ ((ggml_tensor*)src0->extra)->nb,
2416
+ ((ggml_tensor*)src1->extra)->ne,
2417
+ ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2418
+ ((ggml_tensor*)dst->extra)->nb);
2419
+ break;
2420
+ }
2421
+ case GGML_TYPE_F16: {
2422
+ #ifdef ASCEND_310P
2423
+ // Special operation for get_row_f16 kernel of 310P: clear the
2424
+ // content of dest data buffer when row is not aligned to 32 bytes
2425
+ if ((src0->ne[0] % 16) != 0) {
2426
+ size_t dst_len =
2427
+ src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
2428
+ ggml_type_size(
2429
+ GGML_TYPE_F32); // out is also f32, even input is f16
2430
+ ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
2431
+ }
2432
+ #endif
2433
+ aclrtlaunch_ascendc_get_row_f16(
2434
+ 24, ctx.stream(), src0->data, src1->data, dst->data,
2435
+ ((ggml_tensor*)src0->extra)->ne,
2436
+ ((ggml_tensor*)src0->extra)->nb,
2437
+ ((ggml_tensor*)src1->extra)->ne,
2438
+ ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2439
+ ((ggml_tensor*)dst->extra)->nb);
2440
+ break;
2441
+ }
2442
+ case GGML_TYPE_Q4_0:
2443
+ aclrtlaunch_ascendc_get_row_q4_0(
2444
+ 24, ctx.stream(), src0->data, src1->data, dst->data,
2445
+ ((ggml_tensor*)src0->extra)->ne,
2446
+ ((ggml_tensor*)src1->extra)->ne,
2447
+ ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2448
+ ((ggml_tensor*)dst->extra)->nb);
2449
+ break;
2450
+ case GGML_TYPE_Q8_0:
2451
+ aclrtlaunch_ascendc_get_row_q8_0(
2452
+ 24, ctx.stream(), src0->data, src1->data, dst->data,
2453
+ ((ggml_tensor*)src0->extra)->ne,
2454
+ ((ggml_tensor*)src1->extra)->ne,
2455
+ ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2456
+ ((ggml_tensor*)dst->extra)->nb);
2457
+ break;
2458
+ default:
2459
+ GGML_ABORT("fatal error");
2460
+ break;
2461
+ }
2462
+ }
2463
+
2464
+ /**
2465
+ * @brief Repeats elements of a tensor along a specified dimension.
2466
+ *
2467
+ * This function repeats each element of the source tensor `acl_src` a specified
2468
+ * number of times (`repeats`) along the specified dimension `dim` and stores
2469
+ * the result in the destination tensor `acl_dst`.
2470
+ *
2471
+ * @param ctx The context for the CANN backend operations.
2472
+ * @param acl_src The source tensor whose elements will be repeated.
2473
+ * @param acl_dst The destination tensor where the repeated elements will be
2474
+ * stored.
2475
+ * @param dim The dimension along which the elements will be repeated.
2476
+ * @param repeats The number of times each element will be repeated.
2477
+ * @param output_size The size of the output tensor.
2478
+ */
2479
+ static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
2480
+ aclTensor* acl_src, aclTensor* acl_dst,
2481
+ int64_t dim, int64_t repeats,
2482
+ int64_t output_size) {
2483
+ uint64_t workspaceSize = 0;
2484
+ aclOpExecutor* executor;
2485
+ void* workspaceAddr = nullptr;
2486
+
2487
+ ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize(
2488
+ acl_src, repeats, dim, output_size, acl_dst, &workspaceSize,
2489
+ &executor));
2490
+ if (workspaceSize > 0) {
2491
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2492
+ workspaceAddr = workspace_allocator.get();
2493
+ }
2494
+
2495
+ ACL_CHECK(aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize,
2496
+ executor, ctx.stream()));
2497
+ }
2498
+
2499
+ /**
2500
+ * @brief Performs matrix multiplication of two tensors.
2501
+ *
2502
+ * This function computes the matrix multiplication of the input tensor
2503
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
2504
+ * destination tensor `acl_dst`.
2505
+ * The operation is defined as:
2506
+ * \f[
2507
+ * \text {acl_dst}=\text {acl_input@acl_weight}
2508
+ * \f]
2509
+ *
2510
+ * @param ctx The context for the CANN backend operations.
2511
+ * @param acl_input The input tensor for the matrix multiplication.
2512
+ * @param acl_weight The weight tensor for the matrix multiplication.
2513
+ * @param acl_dst The destination tensor where the result of the matrix
2514
+ * multiplication will be stored.
2515
+ */
2516
+ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
2517
+ aclTensor* acl_weight, aclTensor* acl_dst) {
2518
+ int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION: when the input is
+ // fp32, Atlas A2 converts it to HFLOAT32 for the matmul.
2520
+ uint64_t workspaceSize = 0;
2521
+ aclOpExecutor* executor;
2522
+ void* workspaceAddr = nullptr;
2523
+
2524
+ ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
2525
+ cube_math_type, &workspaceSize,
2526
+ &executor));
2527
+
2528
+ if (workspaceSize > 0) {
2529
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2530
+ workspaceAddr = workspace_allocator.get();
2531
+ }
2532
+
2533
+ ACL_CHECK(
2534
+ aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
2535
+ }
2536
+
2537
+ /**
2538
+ * @brief Performs matrix multiplication of two 2D tensors.
2539
+ *
2540
+ * This function computes the matrix multiplication of the input tensor
2541
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
2542
+ * destination tensor `acl_dst`.
2543
+ * The operation is defined as:
2544
+ * \f[
2545
+ * \text {acl_dst}=\text {acl_input@acl_weight}
2546
+ * \f]
2547
+ *
2548
+ * @param ctx The context for the CANN backend operations.
2549
+ * @param acl_input The input tensor for the matrix multiplication.
2550
+ * @param acl_weight The weight tensor for the matrix multiplication.
2551
+ * @param acl_dst The destination tensor where the result of the matrix
2552
+ * multiplication will be stored.
2553
+ */
2554
+ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
2555
+ aclTensor* acl_input, aclTensor* acl_weight,
2556
+ aclTensor* acl_dst) {
2557
+ int8_t cube_math_type = 2;
2558
+ uint64_t workspaceSize = 0;
2559
+ aclOpExecutor* executor;
2560
+ void* workspaceAddr = nullptr;
2561
+
2562
+ ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
2563
+ cube_math_type, &workspaceSize,
2564
+ &executor));
2565
+
2566
+ if (workspaceSize > 0) {
2567
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2568
+ workspaceAddr = workspace_allocator.get();
2569
+ }
2570
+
2571
+ ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
2572
+ }
2573
+
2574
+ /**
2575
+ * @brief Performs matrix multiplication of two 3D tensors.
2576
+ *
2577
+ * This function computes the matrix multiplication of the input tensor
2578
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
2579
+ * destination tensor `acl_dst`.
2580
+ * The operation is defined as:
2581
+ * \f[
2582
+ * \text {acl_dst}=\text {acl_input@acl_weight}
2583
+ * \f]
2584
+ *
2585
+ * @param ctx The context for the CANN backend operations.
2586
+ * @param acl_input The input tensor for the matrix multiplication.
2587
+ * @param acl_weight The weight tensor for the matrix multiplication.
2588
+ * @param acl_dst The destination tensor where the result of the matrix
2589
+ * multiplication will be stored.
2590
+ */
2591
+ static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
2592
+ aclTensor* acl_input, aclTensor* acl_weight,
2593
+ aclTensor* acl_dst) {
2594
+ int8_t cube_math_type = 2;
2595
+ uint64_t workspaceSize = 0;
2596
+ aclOpExecutor* executor;
2597
+ void* workspaceAddr = nullptr;
2598
+
2599
+ ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
2600
+ cube_math_type, &workspaceSize,
2601
+ &executor));
2602
+
2603
+ if (workspaceSize > 0) {
2604
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2605
+ workspaceAddr = workspace_allocator.get();
2606
+ }
2607
+
2608
+ ACL_CHECK(
2609
+ aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
2610
+ }
2611
+
2612
+ /**
2613
+ * @brief Performs matrix multiplication with floating-point precision on
2614
+ * tensors using the CANN backend.
2615
+ *
2616
+ * This function performs matrix multiplication of the input tensor and the
2617
+ * weight tensor, handling broadcasting and transposing as needed, and stores
2618
+ * the result in the destination tensor `dst`.
2619
+ *
2620
+ * @param ctx The context for the CANN backend operations.
2621
+ * @param dst The destination tensor where the result of the matrix
2622
+ * multiplication will be stored.
2623
+ */
2624
+ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
2625
+ ggml_tensor* dst) {
2626
+ ggml_tensor* weight = dst->src[0]; // weight
2627
+ ggml_tensor* input = dst->src[1]; // input
2628
+
2629
+ // When the weight's ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize broadcasts
+ // automatically; when it is not 1, the weight needs to be repeated.
2631
+ BCAST_MUL_MAT_SHAPE(input, weight, dst);
2632
+
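+ // Collapse to a 2D (aclnnMm) or 3D batched (aclnnBatchMatMul) call when the
+ // higher broadcast dimensions are 1; otherwise use the generic aclnnMatmul.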
2633
+ int64_t n_dims = bcast_dims;
2634
+ if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
2635
+ if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
2636
+ n_dims = 2;
2637
+ } else if (bcast_input_ne[2] == 1) {
2638
+ n_dims = 3;
2639
+ }
2640
+ }
2641
+
2642
+ aclTensor* acl_input_tensor =
2643
+ ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
2644
+ int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
2645
+ bcast_weight_ne[2], bcast_weight_ne[3],
2646
+ bcast_weight_ne[4], bcast_weight_ne[5]};
2647
+ size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
2648
+ bcast_weight_nb[2], bcast_weight_nb[3],
2649
+ bcast_weight_nb[4], bcast_weight_nb[5]};
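+ // Swapping ne/nb[0..1] yields a transposed (non-contiguous) view of the
+ // weight, so the matmul below computes input @ weight^T, matching ggml's
+ // MUL_MAT convention (dst = src1 · src0^T).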
2650
+ aclTensor* acl_weight_tensor =
2651
+ ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
2652
+ aclTensor* acl_dst =
2653
+ ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
2654
+
2655
+ switch (n_dims) {
2656
+ case 2:
2657
+ aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
2658
+ break;
2659
+ case 3:
2660
+ aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
2661
+ break;
2662
+ default:
2663
+ aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
2664
+ break;
2665
+ }
2666
+
2667
+ ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
2668
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2669
+ ACL_CHECK(aclDestroyTensor(acl_dst));
2670
+ }
2671
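// The dispatch above collapses the broadcast shapes to a plain 2-D matmul, a
// 3-D batched matmul, or the fully broadcast path, depending on whether the
// outer dims equal 1. A minimal standalone sketch of that rule; the shape
// arrays are hypothetical stand-ins for bcast_input_ne / bcast_weight_ne:

#include <cstdint>

int64_t select_mat_mul_rank(const int64_t input_ne[4], const int64_t weight_ne[4],
                            int64_t bcast_dims) {
    int64_t n_dims = bcast_dims;
    if (input_ne[3] == weight_ne[3] && input_ne[3] == 1) {
        if (input_ne[2] == 1 && weight_ne[2] == 1) {
            n_dims = 2;   // single matrix -> aclnn_mat_mul_2d
        } else if (input_ne[2] == 1) {
            n_dims = 3;   // batched along ne[2] -> aclnn_mat_mul_3d
        }
    }
    return n_dims;        // anything else -> generic aclnn_mat_mul
}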
+
2672
+ /**
2673
+ * @brief Performs matrix multiplication with quantized weights and
2674
+ * floating-point inputs using the CANN backend.
2675
+ *
2676
+ * This function performs matrix multiplication of the input tensor `src1` and
2677
+ * the weight tensor `src0`, handling broadcasting, transposing, and
2678
+ * quantization as needed, and stores the result in the destination tensor
2679
+ * `dst`.
2680
+ *
2681
+ * @param ctx The context for the CANN backend operations.
2682
+ * @param dst The destination tensor where the result of the matrix
2683
+ * multiplication will be stored.
2684
+ */
2685
+ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2686
+ ggml_tensor* dst,
2687
+ const enum ggml_type type) {
2688
+ ggml_tensor* src0 = dst->src[0]; // weight
2689
+ ggml_tensor* src1 = dst->src[1]; // input
2690
+
2691
+     // The shape of the weight is NCHW.
2692
+     // Matrix multiplication uses the HW dims.
2693
+     // NC is regarded as the batch.
2694
+     // The weight needs to be transposed.
2695
+ float weight_elem_size;
2696
+ if (type == GGML_TYPE_Q4_0) {
2697
+ weight_elem_size = float(sizeof(uint8_t)) / 2;
2698
+ } else if (type == GGML_TYPE_Q8_0) {
2699
+ weight_elem_size = float(sizeof(uint8_t));
2700
+ } else {
2701
+ GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
2702
+ }
2703
+ float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
2704
+ size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
2705
+ size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
2706
+
2707
+     // Scales are stored at the end of the weights and also need to be transposed.
2708
+ size_t scale_elem_size = sizeof(uint16_t);
2709
+ size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
2710
+ scale_elem_size};
2711
+ size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
2712
+ char* scale_offset = (char*)src0->data + weight_size;
2713
+
2714
+ // input
2715
+ size_t input_elem_size = sizeof(uint16_t);
2716
+ int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
2717
+ size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
2718
+ size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
2719
+ ggml_cann_pool_alloc input_alloctor(ctx.pool());
2720
+ void* input_buffer = src1->data;
2721
+
2722
+     // cast input to fp16 if needed
2723
+ if (src1->type != GGML_TYPE_F16) {
2724
+ aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
2725
+ input_buffer =
2726
+ input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
2727
+
2728
+ int64_t* input_cast_ne = src1->ne;
2729
+ size_t input_cast_nb[GGML_MAX_DIMS];
2730
+ input_cast_nb[0] = sizeof(uint16_t);
2731
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2732
+ input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
2733
+ }
2734
+
2735
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(
2736
+ input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
2737
+ input_cast_nb, GGML_MAX_DIMS);
2738
+ aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
2739
+
2740
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2741
+ ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
2742
+ }
2743
+
2744
+ // output
2745
+ size_t output_elem_size = sizeof(uint16_t);
2746
+ size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
2747
+ ggml_cann_pool_alloc output_allocator(ctx.pool());
2748
+ void* output_buffer =
2749
+ output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
2750
+ size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
2751
+
2752
+     // aclnn weight-quant matmul; split src0->ne[1] into chunks of at most 65535 elements
2753
+ int64_t max_elem_size = 65535;
2754
+ int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
2755
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool());
2756
+ aclOpExecutor* executor = nullptr;
2757
+ uint64_t workspaceSize = 0;
2758
+ void* workspaceAddr = nullptr;
2759
+ for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
2760
+ for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
2761
+ int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
2762
+ int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
2763
+
2764
+ int64_t batch1 = (n1 * src1->ne[2]) + c1;
2765
+ int64_t batch0 = (n0 * src0->ne[2]) + c0;
2766
+
2767
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(
2768
+ (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
2769
+ input_elem_size, input_ne, input_nb, 2);
2770
+
2771
+ // first split
2772
+ int64_t weight_ne_offset = 0;
2773
+ int64_t weight_ne[2] = {
2774
+ max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
2775
+ src0->ne[0]};
2776
+ int64_t scale_ne_offset = 0;
2777
+ int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
2778
+ int64_t output_ne_offset = 0;
2779
+ int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
2780
+
2781
+ aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
2782
+ (char*)src0->data + batch0 * weight_stride,
2783
+ ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
2784
+ weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
2785
+ aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
2786
+ scale_offset + batch0 * scale_stride, ACL_FLOAT16,
2787
+ scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
2788
+ scale_ne_offset);
2789
+ aclTensor* acl_output_tensor = ggml_cann_create_tensor(
2790
+ (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
2791
+ output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
2792
+ output_ne_offset);
2793
+
2794
+ ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
2795
+ acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
2796
+ nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
2797
+ &workspaceSize, &executor));
2798
+ if (workspaceAddr == nullptr) {
2799
+ workspaceAddr = workspace_allocator.alloc(workspaceSize);
2800
+ }
2801
+ ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
2802
+ workspaceAddr, workspaceSize, executor, ctx.stream()));
2803
+
2804
+ ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
2805
+ ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
2806
+ ACL_CHECK(aclDestroyTensor(acl_output_tensor));
2807
+
2808
+ // other splits
2809
+ for (int64_t split = 1; split < split_size; split++) {
2810
+ weight_ne_offset +=
2811
+ weight_elem_size * weight_ne[0] * weight_ne[1];
2812
+ weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
2813
+ ? src0->ne[1] - (max_elem_size * split)
2814
+ : max_elem_size;
2815
+ scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
2816
+ scale_ne[0] = weight_ne[0];
2817
+ output_ne_offset +=
2818
+ output_elem_size * output_ne[0] * output_ne[1];
2819
+ output_ne[0] = weight_ne[0];
2820
+
2821
+ acl_weight_tensor = ggml_cann_create_tensor(
2822
+ (char*)src0->data + batch0 * weight_stride,
2823
+ ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
2824
+ weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
2825
+ acl_scale_tensor = ggml_cann_create_tensor(
2826
+ scale_offset + batch0 * scale_stride, ACL_FLOAT16,
2827
+ scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
2828
+ scale_ne_offset);
2829
+ acl_output_tensor = ggml_cann_create_tensor(
2830
+ (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
2831
+ output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
2832
+ output_ne_offset);
2833
+
2834
+ ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
2835
+ acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
2836
+ nullptr, nullptr, nullptr, nullptr, QK8_0,
2837
+ acl_output_tensor, &workspaceSize, &executor));
2838
+ ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
2839
+ workspaceAddr, workspaceSize, executor, ctx.stream()));
2840
+
2841
+ ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
2842
+ ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
2843
+ ACL_CHECK(aclDestroyTensor(acl_output_tensor));
2844
+ }
2845
+
2846
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
2847
+ }
2848
+ }
2849
+
2850
+ // cast out
2851
+ if (dst->type != GGML_TYPE_F16) {
2852
+ int64_t* output_cast_ne = dst->ne;
2853
+ size_t output_cast_nb[GGML_MAX_DIMS];
2854
+ output_cast_nb[0] = sizeof(uint16_t);
2855
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2856
+ output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
2857
+ }
2858
+
2859
+ aclTensor* acl_output_tensor = ggml_cann_create_tensor(
2860
+ output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
2861
+ output_cast_nb, GGML_MAX_DIMS);
2862
+ aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
2863
+ aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
2864
+ ggml_cann_type_mapping(dst->type));
2865
+
2866
+ ACL_CHECK(aclDestroyTensor(acl_output_tensor));
2867
+ ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
2868
+ }
2869
+ }
2870
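// The quantized path above assumes the following memory layout: the raw
// (transposed) weight bytes come first and the fp16 scales, one per QK8_0
// block, are appended after them, so the scale region starts at
// src0->data + weight_size. A small sketch recomputing those offsets for a
// hypothetical Q8_0 tensor shape; QK8_0 is assumed to be 32 as in ggml:

#include <cstddef>
#include <cstdint>

struct q8_0_layout {
    size_t weight_stride;  // bytes of one ne[1] x ne[0] weight matrix
    size_t weight_size;    // bytes of all weight matrices
    size_t scale_stride;   // bytes of scales per weight matrix
    size_t scale_offset;   // byte offset of the first scale from src0->data
};

q8_0_layout make_q8_0_layout(const int64_t ne[4]) {
    const int64_t qk8_0       = 32;               // block size, as in ggml
    const size_t  weight_elem = sizeof(uint8_t);  // 1 byte per Q8_0 weight
    const size_t  scale_elem  = sizeof(uint16_t); // fp16 scale per block

    q8_0_layout l;
    l.weight_stride = (size_t)(ne[1] * ne[0]) * weight_elem;
    l.weight_size   = l.weight_stride * (size_t)(ne[2] * ne[3]);
    l.scale_stride  = (size_t)(ne[1] * ne[0] / qk8_0) * scale_elem;
    l.scale_offset  = l.weight_size;              // scales follow the weights
    return l;
}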
+
2871
+ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2872
+ const enum ggml_type type = dst->src[0]->type;
2873
+ switch (type) {
2874
+ case GGML_TYPE_F32:
2875
+ case GGML_TYPE_F16:
2876
+ ggml_cann_mat_mul_fp(ctx, dst);
2877
+ break;
2878
+ case GGML_TYPE_Q4_0:
2879
+ case GGML_TYPE_Q8_0:
2880
+ ggml_cann_mul_mat_quant(ctx, dst, type);
2881
+ break;
2882
+ default:
2883
+ GGML_ABORT("fatal error");
2884
+ break;
2885
+ }
2886
+ }
2887
+
2888
+ /**
2889
+ * @brief Rolls the elements of a tensor along a specified dimension.
2890
+ *
2891
+ * This function rolls the elements of the source tensor `acl_src` by the
2892
+ * specified shifts `shifts` along the specified dimensions `dims`, and stores
2893
+ * the result in the destination tensor `acl_dst`.
2894
+ *
2895
+ * @param ctx The context for the CANN backend operations.
2896
+ * @param acl_src The source tensor whose elements will be rolled.
2897
+ * @param acl_dst The destination tensor where the rolled elements will be
2898
+ * stored.
2899
+ * @param shifts An array specifying the number of positions by which elements
2900
+ * are shifted.
2901
+ * @param dims An array specifying the dimensions along which elements are
2902
+ * shifted.
2903
+ */
2904
+ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2905
+ aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
2906
+ aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
2907
+ aclIntArray* acl_dims = aclCreateIntArray(dims, 1);
2908
+
2909
+ uint64_t workspaceSize = 0;
2910
+ aclOpExecutor* executor;
2911
+ void* workspaceAddr = nullptr;
2912
+
2913
+ ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst,
2914
+ &workspaceSize, &executor));
2915
+ if (workspaceSize > 0) {
2916
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2917
+ workspaceAddr = workspace_allocator.get();
2918
+ }
2919
+
2920
+ ACL_CHECK(aclnnRoll(workspaceAddr, workspaceSize, executor, ctx.stream()));
2921
+
2922
+ ACL_CHECK(aclDestroyIntArray(acl_shifts));
2923
+ ACL_CHECK(aclDestroyIntArray(acl_dims));
2924
+ }
2925
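// A small CPU reference of the circular shift aclnnRoll performs along one
// dimension; the 310P RoPE path below uses it (with a pair-shaped view) to
// turn [q0, q1, q2, q3, ...] into [q1, q0, q3, q2, ...]. Purely illustrative:

#include <cstdint>
#include <vector>

std::vector<float> roll_1d(const std::vector<float>& src, int64_t shift) {
    const int64_t n = (int64_t)src.size();
    std::vector<float> dst(src.size());
    for (int64_t i = 0; i < n; i++) {
        // element i moves to position (i + shift) mod n
        dst[(size_t)(((i + shift) % n + n) % n)] = src[i];
    }
    return dst;
}
// e.g. roll_1d({0, 1, 2, 3}, 1) -> {3, 0, 1, 2}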
+
2926
+ /**
2927
+ * @brief Fills specified positions of a tensor with a scalar value.
2928
+ *
2929
+ * This function fills the positions in the source tensor `acl_src` specified by
2930
+ * `index` along the dimension `dim` with the scalar value `value`.
2931
+ *
2932
+ * @param ctx The context for the CANN backend operations.
2933
+ * @param acl_src The source tensor where the positions will be filled.
2934
+ * @param dim The dimension along which the positions are specified.
2935
+ * @param index An array specifying the positions to be filled.
2936
+ * @param index_num The number of positions specified in the index array.
2937
+ * @param value The scalar value used to fill the specified positions.
2938
+ */
2939
+ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
2940
+ aclTensor* acl_src, int64_t dim,
2941
+ int64_t* index, int64_t index_num,
2942
+ float value) {
2943
+ aclIntArray* acl_index = aclCreateIntArray(index, index_num);
2944
+ aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
2945
+
2946
+ uint64_t workspaceSize = 0;
2947
+ aclOpExecutor* executor;
2948
+ void* workspaceAddr = nullptr;
2949
+
2950
+ ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize(
2951
+ acl_src, dim, acl_index, acl_value, &workspaceSize, &executor));
2952
+ if (workspaceSize > 0) {
2953
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
2954
+ workspaceAddr = workspace_allocator.get();
2955
+ }
2956
+
2957
+ ACL_CHECK(aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize,
2958
+ executor, ctx.stream()));
2959
+
2960
+ ACL_CHECK(aclDestroyIntArray(acl_index));
2961
+ ACL_CHECK(aclDestroyScalar(acl_value));
2962
+ }
2963
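// Further below, aclnn_index_fill_tensor builds the alternating sign vector
// used by the non-neox 310P rotation: a buffer of ones whose even positions
// are overwritten with -1, giving [-1, 1, -1, 1, ...]. A CPU sketch of that
// effect (hypothetical helper, not the CANN call itself):

#include <cstdint>
#include <vector>

std::vector<float> alternating_signs(int64_t n) {
    std::vector<float> v((size_t)n, 1.0f);  // start from all ones
    for (int64_t i = 0; i < n; i += 2) {
        v[(size_t)i] = -1.0f;               // index_fill at indices {0, 2, 4, ...}
    }
    return v;
}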
+
2964
+ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
2965
+ aclTensor* acl_cos_repeat_tensor,
2966
+ aclTensor* acl_sin_repeat_tensor,
2967
+ float theta_scale, float freq_scale,
2968
+ float attn_factor, bool is_neox) {
2969
+     // init sin/cos cache; the cache uses a different repeat method depending
2970
+     // on @param is_neox
2971
+
2972
+ ggml_tensor* src0 = dst->src[0]; // input
2973
+ ggml_tensor* src1 = dst->src[1]; // position
2974
+ ggml_tensor* src2 = dst->src[2]; // freq_factors
2975
+
2976
+     // arange: [0, 1, ..., ne0/2 - 1]
2977
+ int64_t arange_length = src0->ne[0] / 2;
2978
+ ggml_cann_pool_alloc arange_allocator(ctx.pool(),
2979
+ arange_length * sizeof(float_t));
2980
+ void* arange_buffer = arange_allocator.get();
2981
+ int64_t arange_ne[] = {arange_length, 1, 1, 1};
2982
+ size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
2983
+ arange_length * sizeof(float_t)};
2984
+
2985
+ aclTensor* acl_arange_tensor =
2986
+ ggml_cann_create_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t),
2987
+ arange_ne, arange_nb, GGML_MAX_DIMS);
2988
+ float start = 0;
2989
+ float step = 1;
2990
+ float stop = src0->ne[0] / 2;
2991
+ float n_elements = src0->ne[0] / 2;
2992
+ aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements);
2993
+
2994
+ // power
2995
+     // aclnnPowScalarTensor(): its @param self is a tensor but should be a
2996
+     // scalar, so use aclnn_pow_tensor_tensor() until this is fixed:
2997
+     // aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
2998
+     // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor,
2999
+     //                           acl_power_tensor);
3000
+ ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
3001
+ arange_length * sizeof(float_t));
3002
+ void* theta_scale_buffer = theta_scale_allocator.get();
3003
+ aclTensor* acl_theta_scale_tensor = aclnn_values(
3004
+ ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
3005
+ GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
3006
+ aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
3007
+
3008
+ // freq_scale
3009
+ if (freq_scale != 1) {
3010
+ aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
3011
+ }
3012
+
3013
+ // freq_factors
3014
+ if (src2) {
3015
+ aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
3016
+ src2->data, ggml_cann_type_mapping(src2->type),
3017
+ ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
3018
+ aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
3019
+ nullptr, true);
3020
+ ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
3021
+ }
3022
+
3023
+ // position
3024
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
3025
+ int64_t position_length = src1->ne[0];
3026
+ int64_t position_ne[] = {1, position_length, 1, 1};
3027
+ size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t),
3028
+ sizeof(int32_t) * position_length,
3029
+ sizeof(int32_t) * position_length};
3030
+ aclTensor* acl_position_tensor = ggml_cann_create_tensor(
3031
+ src1->data, ggml_cann_type_mapping(src1->type),
3032
+ ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);
3033
+
3034
+ // power * position
3035
+ int64_t theta_length = arange_length * position_length;
3036
+ ggml_cann_pool_alloc theta_allocator(ctx.pool(),
3037
+ theta_length * sizeof(float_t));
3038
+ void* theta_buffer = theta_allocator.get();
3039
+ int64_t theta_ne[] = {arange_length, position_length, 1, 1};
3040
+ size_t theta_nb[GGML_MAX_DIMS];
3041
+ theta_nb[0] = sizeof(float_t);
3042
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3043
+ theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
3044
+ }
3045
+ aclTensor* acl_theta_tensor =
3046
+ ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
3047
+ theta_ne, theta_nb, GGML_MAX_DIMS);
3048
+ aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
3049
+ acl_theta_tensor);
3050
+
3051
+ // permute: [0,1,2,3]->[0,2,1,3]
3052
+ int64_t permute_ne[] = {arange_length, 1, position_length, 1};
3053
+ size_t permute_nb[GGML_MAX_DIMS];
3054
+ permute_nb[0] = sizeof(float_t);
3055
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3056
+ permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1];
3057
+ }
3058
+ ggml_cann_pool_alloc permute_allocator(ctx.pool(),
3059
+ theta_length * sizeof(float_t));
3060
+ void* permute_buffer = permute_allocator.get();
3061
+ aclTensor* acl_permute_tensor = ggml_cann_create_tensor(
3062
+ permute_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
3063
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
3064
+ int64_t permute_dim[] = {0, 2, 1, 3};
3065
+ int64_t num_dims = 4;
3066
+ aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim,
3067
+ num_dims);
3068
+
3069
+ // sin/cos
3070
+ ggml_cann_pool_alloc sin_allocator(ctx.pool(),
3071
+ theta_length * sizeof(float_t));
3072
+ void* sin_buffer = sin_allocator.get();
3073
+ aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
3074
+ sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
3075
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
3076
+ aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor);
3077
+
3078
+ ggml_cann_pool_alloc cos_allocator(ctx.pool(),
3079
+ theta_length * sizeof(float_t));
3080
+ void* cos_buffer = cos_allocator.get();
3081
+ aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
3082
+ cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
3083
+ GGML_MAX_DIMS, ACL_FORMAT_ND);
3084
+ aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
3085
+
3086
+ // attn_factor
3087
+ if (attn_factor != 1) {
3088
+ aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
3089
+ aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
3090
+ }
3091
+
3092
+ // repeat
3093
+ if (is_neox) {
3094
+ int64_t repeatsArray[] = {1, 1, 1, 2};
3095
+ aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray);
3096
+ aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray);
3097
+ } else {
3098
+ int64_t num_repeats = 2;
3099
+ int64_t dim = 3;
3100
+ int64_t output_size = arange_length * num_repeats;
3101
+ aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
3102
+ num_repeats, output_size);
3103
+ aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
3104
+ num_repeats, output_size);
3105
+ }
3106
+
3107
+ // release
3108
+ ACL_CHECK(aclDestroyTensor(acl_arange_tensor));
3109
+ ACL_CHECK(aclDestroyTensor(acl_theta_scale_tensor));
3110
+ ACL_CHECK(aclDestroyTensor(acl_position_tensor));
3111
+ ACL_CHECK(aclDestroyTensor(acl_theta_tensor));
3112
+ ACL_CHECK(aclDestroyTensor(acl_permute_tensor));
3113
+ ACL_CHECK(aclDestroyTensor(acl_sin_tensor));
3114
+ ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
3115
+ }
3116
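// The cache built above evaluates, for each position p and frequency index i
// in [0, ne0/2), the angle theta(p, i) = p * freq_scale * theta_scale^i
// (with theta_scale = freq_base^(-2 / n_dims)), optionally divided by
// freq_factors[i], and stores attn_factor * sin(theta) and
// attn_factor * cos(theta), repeated to width ne0 (tiled for neox, interleaved
// otherwise). A minimal CPU sketch under those assumptions, ignoring
// freq_factors:

#include <cmath>
#include <cstdint>
#include <vector>

void rope_cache_ref(int64_t half_dim, const std::vector<int32_t>& pos,
                    float theta_scale, float freq_scale, float attn_factor,
                    std::vector<float>& sin_out, std::vector<float>& cos_out) {
    sin_out.resize(pos.size() * (size_t)half_dim);
    cos_out.resize(pos.size() * (size_t)half_dim);
    for (size_t p = 0; p < pos.size(); p++) {
        for (int64_t i = 0; i < half_dim; i++) {
            const float theta =
                (float)pos[p] * freq_scale * std::pow(theta_scale, (float)i);
            sin_out[p * (size_t)half_dim + (size_t)i] = std::sin(theta) * attn_factor;
            cos_out[p * (size_t)half_dim + (size_t)i] = std::cos(theta) * attn_factor;
        }
    }
}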
+
3117
+ #ifdef __cplusplus
3118
+ extern "C" {
3119
+ #endif
3120
+ aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
3121
+ const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
3122
+ int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
3123
+ aclOpExecutor** executor);
3124
+ aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
3125
+ uint64_t workspaceSize,
3126
+ aclOpExecutor* executor,
3127
+ aclrtStream stream);
3128
+ #ifdef __cplusplus
3129
+ }
3130
+ #endif
3131
+
3132
+ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
3133
+ // TODO: use ascendc
3134
+     // Only tested with the LLAMA model.
3135
+ ggml_tensor* src0 = dst->src[0]; // input
3136
+ ggml_tensor* src2 = dst->src[2]; // freq_factors
3137
+
3138
+ // param
3139
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
3140
+ // const int n_past = ((int32_t *) dst->op_params)[0];
3141
+ const int n_dims = ((int32_t*)dst->op_params)[1];
3142
+ const int mode = ((int32_t*)dst->op_params)[2];
3143
+ // const int n_ctx = ((int32_t *) dst->op_params)[3];
3144
+ const int n_ctx_orig = ((int32_t*)dst->op_params)[4];
3145
+
3146
+ GGML_TENSOR_UNARY_OP_LOCALS
3147
+
3148
+ memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float));
3149
+ memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float));
3150
+ memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float));
3151
+ memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float));
3152
+ memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
3153
+ memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
3154
+
3155
+ // TODO: n_dims <= ne0
3156
+ GGML_ASSERT(n_dims == ne0);
3157
+ GGML_ASSERT(n_dims % 2 == 0);
3158
+ // TODO: ext_factor != 0
3159
+ GGML_ASSERT(ext_factor == 0);
3160
+
3161
+ const float theta_scale = powf(freq_base, -2.0f / n_dims);
3162
+
3163
+ float corr_dims[2];
3164
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
3165
+ beta_slow, corr_dims);
3166
+
3167
+ const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
3168
+
3169
+ // init cos/sin cache
3170
+ ggml_cann_pool_alloc sin_allocator(
3171
+ ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
3172
+ ggml_cann_pool_alloc cos_allocator(
3173
+ ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
3174
+ void* sin_buffer = sin_allocator.get();
3175
+ void* cos_buffer = cos_allocator.get();
3176
+
3177
+ int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
3178
+ size_t sin_reshape_nb[GGML_MAX_DIMS];
3179
+ sin_reshape_nb[0] = sizeof(float_t);
3180
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3181
+ sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
3182
+ }
3183
+ aclTensor* acl_sin_reshape_tensor =
3184
+ ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t),
3185
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
3186
+ aclTensor* acl_cos_reshape_tensor =
3187
+ ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
3188
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
3189
+ aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
3190
+ theta_scale, freq_scale, attn_factor, is_neox);
3191
+
3192
+ aclTensor* acl_src = ggml_cann_create_tensor(src0);
3193
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
3194
+
3195
+ #ifdef ASCEND_310P
3196
+ // Special ROPE operation for 310P
3197
+
3198
+ // roll input
3199
+ void* input_roll_buffer;
3200
+ aclTensor* acl_minus_one_tensor;
3201
+ void* minus_one_scale_buffer = nullptr;
3202
+ ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
3203
+ ggml_cann_pool_alloc minus_one_scale_allocator(
3204
+ ctx.pool(), sizeof(float_t) * src0->ne[0]);
3205
+ if (!is_neox) {
3206
+ // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
3207
+ input_roll_buffer = roll_allocator.get();
3208
+ int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
3209
+ src0->ne[2], src0->ne[3]};
3210
+ size_t input_roll_nb[GGML_MAX_DIMS];
3211
+ input_roll_nb[0] = ggml_type_size(src0->type);
3212
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3213
+ input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
3214
+ }
3215
+ aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
3216
+ input_roll_buffer, ggml_cann_type_mapping(src0->type),
3217
+ ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
3218
+ GGML_MAX_DIMS);
3219
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(
3220
+ src0->data, ggml_cann_type_mapping(src0->type),
3221
+ ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
3222
+ GGML_MAX_DIMS);
3223
+
3224
+ int64_t shifts[] = {1};
3225
+ int64_t dims[] = {3};
3226
+ aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
3227
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
3228
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
3229
+
3230
+ // init [-1, 1, -1, 1, ...]
3231
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
3232
+
3233
+ int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
3234
+ size_t minus_one_nb[GGML_MAX_DIMS];
3235
+ minus_one_nb[0] = sizeof(float_t);
3236
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3237
+ minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
3238
+ }
3239
+ acl_minus_one_tensor = aclnn_values(
3240
+ ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
3241
+ minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
3242
+ int64_t dim = 3;
3243
+ int64_t* index = new int64_t[src0->ne[0]];
3244
+ for (int i = 0; i < src0->ne[0]; i++) {
3245
+ index[i] = i / 2 * 2;
3246
+ }
3247
+ int64_t index_num = src0->ne[0];
3248
+ float value = -1;
3249
+ aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
3250
+ index_num, value);
3251
+ } else {
3252
+ // roll input: [q0,q1,q2,...] ->
3253
+ // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
3254
+ input_roll_buffer = roll_allocator.get();
3255
+ aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
3256
+ input_roll_buffer, ggml_cann_type_mapping(src0->type),
3257
+ ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
3258
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
3259
+
3260
+ int64_t shifts[] = {src0->ne[0] / 2};
3261
+ int64_t dims[] = {3};
3262
+ aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
3263
+
3264
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
3265
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
3266
+         // init [-1, ..., -1, 1, ..., 1] (first half -1, second half 1)
3267
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
3268
+ int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
3269
+ size_t minus_one_nb[GGML_MAX_DIMS];
3270
+ minus_one_nb[0] = sizeof(float_t);
3271
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3272
+ minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
3273
+ }
3274
+ acl_minus_one_tensor = aclnn_values(
3275
+ ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
3276
+ minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
3277
+ // -1 * first half
3278
+ int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
3279
+ size_t first_half_nb[GGML_MAX_DIMS];
3280
+ first_half_nb[0] = sizeof(float_t);
3281
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3282
+ first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
3283
+ }
3284
+ aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
3285
+ minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
3286
+ first_half_nb, GGML_MAX_DIMS);
3287
+ bool inplace = true;
3288
+ float scale = -1;
3289
+ aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
3290
+ ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
3291
+ }
3292
+
3293
+ // TODO: n_dims < ne0
3294
+ GGML_ASSERT(n_dims == src0->ne[0]);
3295
+
3296
+ // input * scale
3297
+ ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
3298
+ ggml_nbytes(src0));
3299
+ void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
3300
+ size_t input_nb[GGML_MAX_DIMS];
3301
+ input_nb[0] = ggml_type_size(src0->type);
3302
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3303
+ input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
3304
+ }
3305
+ aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
3306
+ input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
3307
+ ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
3308
+ aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
3309
+ input_roll_buffer, ggml_cann_type_mapping(src0->type),
3310
+ ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
3311
+
3312
+ aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
3313
+ acl_input_roll_mul_scale_tensor);
3314
+
3315
+ // output
3316
+ void* output_fp32_buffer;
3317
+ if (src0->type == GGML_TYPE_F32) {
3318
+ aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
3319
+ aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
3320
+ acl_sin_reshape_tensor);
3321
+ aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
3322
+ // TODO: ne0 != n_dims in mode2
3323
+ } else if (src0->type == GGML_TYPE_F16) {
3324
+ size_t input_fp32_nb[GGML_MAX_DIMS];
3325
+ input_fp32_nb[0] = sizeof(float_t);
3326
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3327
+ input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
3328
+ }
3329
+ ggml_cann_pool_alloc fp32_allocator1(
3330
+ ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
3331
+ void* input_fp32_buffer1 = fp32_allocator1.get();
3332
+ aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
3333
+ input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
3334
+ input_fp32_nb, GGML_MAX_DIMS);
3335
+ ggml_cann_pool_alloc fp32_allocator2(
3336
+ ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
3337
+ void* input_fp32_buffer2 = fp32_allocator2.get();
3338
+ aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
3339
+ input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
3340
+ input_fp32_nb, GGML_MAX_DIMS);
3341
+
3342
+ ggml_cann_pool_alloc fp32_allocator(
3343
+ ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
3344
+ output_fp32_buffer = fp32_allocator.get();
3345
+ aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
3346
+ output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
3347
+ input_fp32_nb, GGML_MAX_DIMS);
3348
+ aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
3349
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
3350
+ input_fp32_tensor2);
3351
+ aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
3352
+ output_fp32_tensor);
3353
+ aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
3354
+
3355
+ ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
3356
+ ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
3357
+ ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
3358
+ ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
3359
+ ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
3360
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
3361
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
3362
+ ACL_CHECK(aclDestroyTensor(acl_src));
3363
+ }
3364
+ return;
3365
+ #endif
3366
+
3367
+     // src0->type == GGML_TYPE_F16
3368
+     // TODO: optimize this `if` block
3369
+ if (src0->type == GGML_TYPE_F16) {
3370
+ ggml_cann_pool_alloc sin_final_allocator(
3371
+ ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
3372
+ ggml_cann_pool_alloc cos_final_allocator(
3373
+ ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
3374
+ void* sin_final_buffer = sin_final_allocator.get();
3375
+ void* cos_final_buffer = cos_final_allocator.get();
3376
+
3377
+ int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
3378
+ size_t sin_final_nb[GGML_MAX_DIMS];
3379
+ sin_final_nb[0] = ggml_type_size(src0->type);
3380
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3381
+ sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
3382
+ }
3383
+ aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
3384
+ sin_final_buffer, ggml_cann_type_mapping(src0->type),
3385
+ ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
3386
+ GGML_MAX_DIMS);
3387
+ aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
3388
+ cos_final_buffer, ggml_cann_type_mapping(src0->type),
3389
+ ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
3390
+ GGML_MAX_DIMS);
3391
+
3392
+ aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
3393
+ ggml_cann_type_mapping(src0->type));
3394
+ aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
3395
+ ggml_cann_type_mapping(src0->type));
3396
+ ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
3397
+ ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
3398
+ acl_sin_reshape_tensor = acl_sin_final_tensor;
3399
+ acl_cos_reshape_tensor = acl_cos_final_tensor;
3400
+ }
3401
+
3402
+ uint64_t workspaceSize = 0;
3403
+ aclOpExecutor* executor;
3404
+
3405
+ void* workspaceAddr = nullptr;
3406
+
3407
+ int acl_mode = mode;
3408
+ if (mode == 0) {
3409
+ acl_mode = 1;
3410
+ }
3411
+
3412
+ ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
3413
+ acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
3414
+ acl_dst, &workspaceSize, &executor));
3415
+ if (workspaceSize > 0) {
3416
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
3417
+ workspaceAddr = workspace_allocator.get();
3418
+ }
3419
+
3420
+ ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
3421
+ executor, ctx.stream()));
3422
+
3423
+ ACL_CHECK(aclDestroyTensor(acl_src));
3424
+ ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
3425
+ ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
3426
+ ACL_CHECK(aclDestroyTensor(acl_dst));
3427
+ }
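// Independent of backend, the operator applies a per-position rotation to each
// channel pair: (x0, x1) -> (x0*cos - x1*sin, x1*cos + x0*sin). In the
// non-neox layout the pair is (x[2i], x[2i+1]); in the neox layout it is
// (x[i], x[i + n/2]). A minimal CPU reference for the non-neox case, using
// sin/cos values as produced by the cache above (repeated per pair):

#include <cstddef>
#include <vector>

void rope_apply_ref(std::vector<float>& row,          // one row of length ne0
                    const std::vector<float>& cosv,   // cos per channel
                    const std::vector<float>& sinv) { // sin per channel
    for (size_t i = 0; i + 1 < row.size(); i += 2) {
        const float x0 = row[i];
        const float x1 = row[i + 1];
        // matches dst = src*cos + roll(src)*[-1, 1, ...]*sin from the 310P path
        row[i]     = x0 * cosv[i]     - x1 * sinv[i];
        row[i + 1] = x1 * cosv[i + 1] + x0 * sinv[i + 1];
    }
}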