whispercpp 1.2.0.2 → 1.3.1
- checksums.yaml +4 -4
- data/.gitignore +5 -0
- data/LICENSE +1 -1
- data/README.md +165 -434
- data/Rakefile +46 -86
- data/ext/.gitignore +13 -0
- data/ext/cpu.mk +9 -0
- data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
- data/ext/extconf.rb +185 -7
- data/ext/ggml/include/ggml-alloc.h +76 -0
- data/ext/ggml/include/ggml-backend.h +352 -0
- data/ext/ggml/include/ggml-blas.h +25 -0
- data/ext/ggml/include/ggml-cann.h +123 -0
- data/ext/ggml/include/ggml-cpp.h +38 -0
- data/ext/ggml/include/ggml-cpu.h +135 -0
- data/ext/ggml/include/ggml-cuda.h +47 -0
- data/ext/ggml/include/ggml-kompute.h +50 -0
- data/ext/ggml/include/ggml-metal.h +66 -0
- data/ext/ggml/include/ggml-opencl.h +26 -0
- data/ext/ggml/include/ggml-opt.h +216 -0
- data/ext/ggml/include/ggml-rpc.h +28 -0
- data/ext/ggml/include/ggml-sycl.h +49 -0
- data/ext/ggml/include/ggml-vulkan.h +31 -0
- data/ext/ggml/include/ggml.h +2285 -0
- data/ext/ggml/src/ggml-alloc.c +1037 -0
- data/ext/ggml/src/ggml-amx/common.h +94 -0
- data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
- data/ext/ggml/src/ggml-amx/mmq.h +17 -0
- data/ext/ggml/src/ggml-backend-impl.h +256 -0
- data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
- data/ext/ggml/src/ggml-backend.cpp +1999 -0
- data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
- data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
- data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
- data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
- data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- data/ext/ggml/src/ggml-cann/common.h +286 -0
- data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
- data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
- data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
- data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
- data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
- data/ext/ggml/src/ggml-common.h +1853 -0
- data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
- data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
- data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
- data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
- data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- data/ext/ggml/src/ggml-impl.h +556 -0
- data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
- data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
- data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
- data/ext/ggml/src/ggml-opt.cpp +854 -0
- data/ext/ggml/src/ggml-quants.c +5238 -0
- data/ext/ggml/src/ggml-quants.h +100 -0
- data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
- data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
- data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
- data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
- data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
- data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
- data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
- data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
- data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
- data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
- data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
- data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
- data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
- data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- data/ext/ggml/src/ggml-threading.cpp +12 -0
- data/ext/ggml/src/ggml-threading.h +14 -0
- data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
- data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- data/ext/ggml/src/ggml.c +7694 -0
- data/ext/include/whisper.h +672 -0
- data/ext/metal-embed.mk +17 -0
- data/ext/metal.mk +6 -0
- data/ext/ruby_whisper.cpp +1608 -159
- data/ext/ruby_whisper.h +10 -0
- data/ext/scripts/get-flags.mk +38 -0
- data/ext/src/coreml/whisper-decoder-impl.h +146 -0
- data/ext/src/coreml/whisper-decoder-impl.m +201 -0
- data/ext/src/coreml/whisper-encoder-impl.h +142 -0
- data/ext/src/coreml/whisper-encoder-impl.m +197 -0
- data/ext/src/coreml/whisper-encoder.h +26 -0
- data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
- data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
- data/ext/src/whisper.cpp +7393 -0
- data/extsources.rb +6 -0
- data/lib/whisper/model/uri.rb +157 -0
- data/lib/whisper.rb +2 -0
- data/tests/helper.rb +7 -0
- data/tests/jfk_reader/.gitignore +5 -0
- data/tests/jfk_reader/extconf.rb +3 -0
- data/tests/jfk_reader/jfk_reader.c +68 -0
- data/tests/test_callback.rb +160 -0
- data/tests/test_error.rb +20 -0
- data/tests/test_model.rb +71 -0
- data/tests/test_package.rb +31 -0
- data/tests/test_params.rb +160 -0
- data/tests/test_segment.rb +83 -0
- data/tests/test_whisper.rb +211 -123
- data/whispercpp.gemspec +36 -0
- metadata +137 -11
- data/ext/ggml.c +0 -8616
- data/ext/ggml.h +0 -748
- data/ext/whisper.cpp +0 -4829
- data/ext/whisper.h +0 -402
@@ -0,0 +1,3427 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2023-2024 The ggml authors
|
3
|
+
*
|
4
|
+
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
|
+
* of this software and associated documentation files (the "Software"), to
|
6
|
+
* deal in the Software without restriction, including without limitation the
|
7
|
+
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
8
|
+
* sell copies of the Software, and to permit persons to whom the Software is
|
9
|
+
* furnished to do so, subject to the following conditions:
|
10
|
+
*
|
11
|
+
* The above copyright notice and this permission notice shall be included in
|
12
|
+
* all copies or substantial portions of the Software.
|
13
|
+
*
|
14
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15
|
+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16
|
+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17
|
+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18
|
+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
19
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
20
|
+
* IN THE SOFTWARE.
|
21
|
+
*/
|
22
|
+
|
23
|
+
#include "aclnn_ops.h"
|
24
|
+
|
25
|
+
#include <aclnnop/aclnn_addcdiv.h>
|
26
|
+
#include <aclnnop/aclnn_avgpool2d.h>
|
27
|
+
#include <aclnnop/aclnn_batch_matmul.h>
|
28
|
+
#include <aclnnop/aclnn_cast.h>
|
29
|
+
#include <aclnnop/aclnn_constant_pad_nd.h>
|
30
|
+
#include <aclnnop/aclnn_copy.h>
|
31
|
+
#include <aclnnop/aclnn_cos.h>
|
32
|
+
#include <aclnnop/aclnn_div.h>
|
33
|
+
#include <aclnnop/aclnn_exp.h>
|
34
|
+
#include <aclnnop/aclnn_fill_scalar.h>
|
35
|
+
#include <aclnnop/aclnn_group_norm.h>
|
36
|
+
#include <aclnnop/aclnn_index_fill_tensor.h>
|
37
|
+
#include <aclnnop/aclnn_layer_norm.h>
|
38
|
+
#include <aclnnop/aclnn_matmul.h>
|
39
|
+
#include <aclnnop/aclnn_max_pool.h>
|
40
|
+
#include <aclnnop/aclnn_mm.h>
|
41
|
+
#include <aclnnop/aclnn_permute.h>
|
42
|
+
#include <aclnnop/aclnn_pow_tensor_tensor.h>
|
43
|
+
#include <aclnnop/aclnn_reduce_sum.h>
|
44
|
+
#include <aclnnop/aclnn_repeat.h>
|
45
|
+
#include <aclnnop/aclnn_repeat_interleave.h>
|
46
|
+
#include <aclnnop/aclnn_roll.h>
|
47
|
+
#include <aclnnop/aclnn_sin.h>
|
48
|
+
#include <aclnnop/aclnn_softmax.h>
|
49
|
+
#include <aclnnop/aclnn_tril.h>
|
50
|
+
#include <aclnnop/aclnn_triu.h>
|
51
|
+
#include <aclnnop/aclnn_upsample_nearest_2d.h>
|
52
|
+
#include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
|
53
|
+
#include <float.h>
|
54
|
+
|
55
|
+
#include <cmath>
|
56
|
+
#include <cstring>
|
57
|
+
#include <exception>
|
58
|
+
#include <vector>
|
59
|
+
|
60
|
+
#include "ggml-impl.h"
|
61
|
+
#include "kernels/ascendc_kernels.h"
|
62
|
+
|
63
|
+
#define GGML_COMMON_DECL_C
|
64
|
+
|
65
|
+
#include "../ggml-common.h"
|
66
|
+
|
67
|
+
/**
|
68
|
+
* @brief Repeats elements of a tensor along each dimension according to the
|
69
|
+
* specified repeat array.
|
70
|
+
*
|
71
|
+
* @param ctx The context for the CANN backend operations.
|
72
|
+
* @param acl_src The source tensor to be repeated.
|
73
|
+
* @param acl_dst The destination tensor after repeating.
|
74
|
+
* @param repeat_array The array specifying the number of repetitions along each
|
75
|
+
* dimension.
|
76
|
+
*/
|
77
|
+
static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
78
|
+
aclTensor* acl_dst, int64_t* repeat_array) {
|
79
|
+
// repeat tensor along each dim with repeat_array
|
80
|
+
aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
|
81
|
+
|
82
|
+
uint64_t workspaceSize = 0;
|
83
|
+
aclOpExecutor* executor;
|
84
|
+
void* workspaceAddr = nullptr;
|
85
|
+
|
86
|
+
ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst,
|
87
|
+
&workspaceSize, &executor));
|
88
|
+
|
89
|
+
if (workspaceSize > 0) {
|
90
|
+
// Memory from allocator will "free" immediately, and this memory
|
91
|
+
// will be alloced to other pointers, but it won't access before
|
92
|
+
// this async task end because all tasks in same stream will execute
|
93
|
+
// in queue.
|
94
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
95
|
+
workspaceAddr = workspace_allocator.get();
|
96
|
+
}
|
97
|
+
ACL_CHECK(
|
98
|
+
aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
99
|
+
ACL_CHECK(aclDestroyIntArray(repeats));
|
100
|
+
}
|
101
|
+
|
102
|
+
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
103
|
+
ggml_tensor* src = dst->src[0];
|
104
|
+
GGML_ASSERT(ggml_can_repeat(src, dst));
|
105
|
+
|
106
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
107
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
108
|
+
|
109
|
+
int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
|
110
|
+
dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
|
111
|
+
|
112
|
+
aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
|
113
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
114
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
115
|
+
}
|
116
|
+
|
117
|
+
/**
|
118
|
+
* @brief Adds two tensors element-wise and stores the result in a destination
|
119
|
+
* tensor.
|
120
|
+
*
|
121
|
+
* This function performs the operation:
|
122
|
+
* \f[
|
123
|
+
* dst = acl\_src0 + alpha \times acl\_src1
|
124
|
+
* \f]
|
125
|
+
* where alpha is a scalar value and defaults to 1.0f.
|
126
|
+
*
|
127
|
+
* @param ctx The context for the CANN backend operations.
|
128
|
+
* @param acl_src0 The first source tensor.
|
129
|
+
* @param acl_src1 The second source tensor.
|
130
|
+
* @param acl_dst The destination tensor where the result will be stored.
|
131
|
+
*/
|
132
|
+
static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
|
133
|
+
aclTensor* acl_src1, aclTensor* acl_dst) {
|
134
|
+
aclScalar* alpha = nullptr;
|
135
|
+
float alphaValue = 1.0f;
|
136
|
+
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
|
137
|
+
|
138
|
+
uint64_t workspaceSize = 0;
|
139
|
+
aclOpExecutor* executor;
|
140
|
+
void* workspaceAddr = nullptr;
|
141
|
+
|
142
|
+
ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
|
143
|
+
&workspaceSize, &executor));
|
144
|
+
if (workspaceSize > 0) {
|
145
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
146
|
+
workspaceAddr = workspace_allocator.get();
|
147
|
+
}
|
148
|
+
|
149
|
+
ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
150
|
+
|
151
|
+
ACL_CHECK(aclDestroyScalar(alpha));
|
152
|
+
}
|
153
|
+
|
154
|
+
void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
155
|
+
ggml_tensor* src0 = dst->src[0];
|
156
|
+
ggml_tensor* src1 = dst->src[1];
|
157
|
+
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
158
|
+
|
159
|
+
aclTensor* acl_src0;
|
160
|
+
aclTensor* acl_src1;
|
161
|
+
aclTensor* acl_dst;
|
162
|
+
|
163
|
+
// Need bcast
|
164
|
+
if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
|
165
|
+
BCAST_SHAPE(src0, src1)
|
166
|
+
acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
|
167
|
+
acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
|
168
|
+
acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
|
169
|
+
} else {
|
170
|
+
acl_src0 = ggml_cann_create_tensor(src0);
|
171
|
+
acl_src1 = ggml_cann_create_tensor(src1);
|
172
|
+
acl_dst = ggml_cann_create_tensor(dst);
|
173
|
+
}
|
174
|
+
|
175
|
+
aclnn_add(ctx, acl_src0, acl_src1, acl_dst);
|
176
|
+
|
177
|
+
ACL_CHECK(aclDestroyTensor(acl_src0));
|
178
|
+
ACL_CHECK(aclDestroyTensor(acl_src1));
|
179
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
180
|
+
}
|
181
|
+
|
182
|
+
void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
183
|
+
ggml_tensor* src = dst->src[0];
|
184
|
+
|
185
|
+
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
186
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
187
|
+
|
188
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
189
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
190
|
+
|
191
|
+
float negative_slope;
|
192
|
+
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
193
|
+
aclScalar* acl_negative_slope =
|
194
|
+
aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
|
195
|
+
|
196
|
+
uint64_t workspaceSize = 0;
|
197
|
+
aclOpExecutor* executor;
|
198
|
+
void* workspaceAddr = nullptr;
|
199
|
+
|
200
|
+
ACL_CHECK(aclnnLeakyReluGetWorkspaceSize(
|
201
|
+
acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor));
|
202
|
+
if (workspaceSize > 0) {
|
203
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
204
|
+
workspaceAddr = workspace_allocator.get();
|
205
|
+
}
|
206
|
+
|
207
|
+
ACL_CHECK(
|
208
|
+
aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
209
|
+
|
210
|
+
ACL_CHECK(aclDestroyScalar(acl_negative_slope));
|
211
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
212
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
213
|
+
}
|
214
|
+
|
215
|
+
/**
|
216
|
+
* @brief Concatenates a list of tensors along a specified dimension and stores
|
217
|
+
* the result in a destination tensor.
|
218
|
+
*
|
219
|
+
* @param ctx The context for the CANN backend operations.
|
220
|
+
* @param tensorList The list of tensors to be concatenated.
|
221
|
+
* @param acl_dst The destination tensor where the concatenated result will be
|
222
|
+
* stored.
|
223
|
+
* @param concat_dim The dimension along which the tensors will be concatenated.
|
224
|
+
*/
|
225
|
+
static void aclnn_concat(ggml_backend_cann_context& ctx,
|
226
|
+
aclTensorList* tensorList, aclTensor* acl_dst,
|
227
|
+
int64_t concat_dim) {
|
228
|
+
uint64_t workspaceSize = 0;
|
229
|
+
aclOpExecutor* executor;
|
230
|
+
void* workspaceAddr = nullptr;
|
231
|
+
|
232
|
+
ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst,
|
233
|
+
&workspaceSize, &executor));
|
234
|
+
if (workspaceSize > 0) {
|
235
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
236
|
+
workspaceAddr = workspace_allocator.get();
|
237
|
+
}
|
238
|
+
|
239
|
+
ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
240
|
+
}
|
241
|
+
|
242
|
+
void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
243
|
+
ggml_tensor* src0 = dst->src[0];
|
244
|
+
ggml_tensor* src1 = dst->src[1];
|
245
|
+
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
246
|
+
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
247
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
248
|
+
|
249
|
+
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
250
|
+
|
251
|
+
GGML_ASSERT(dim >= 0 && dim < 4);
|
252
|
+
int32_t acl_dim = 3 - dim;
|
253
|
+
|
254
|
+
aclTensor* tensors[] = {acl_src0, acl_src1};
|
255
|
+
aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
|
256
|
+
aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
|
257
|
+
|
258
|
+
ACL_CHECK(aclDestroyTensorList(tensorList));
|
259
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
260
|
+
}
|
261
|
+
|
262
|
+
/**
|
263
|
+
* @brief Creates a tensor with values starting from `start`, incremented by
|
264
|
+
* `step`, and ending before `stop`.
|
265
|
+
*
|
266
|
+
* This function performs the operation:
|
267
|
+
* \f[
|
268
|
+
* \text {out }_{i+1}=\text {out }_i+\text {step}
|
269
|
+
* \f]
|
270
|
+
* the range is [start, stop).
|
271
|
+
*
|
272
|
+
* @param ctx The context for the CANN backend operations.
|
273
|
+
* @param acl_dst The destination tensor where the values will be stored.
|
274
|
+
* @param start The starting value of the range.
|
275
|
+
* @param stop The ending value of the range (exclusive).
|
276
|
+
* @param step The step size between consecutive values.
|
277
|
+
* @param n_elements The number of elements in the destination tensor.
|
278
|
+
*/
|
279
|
+
static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
|
280
|
+
float start, float stop, float step,
|
281
|
+
int64_t n_elements) {
|
282
|
+
int64_t steps = (int64_t)std::ceil((stop - start) / step);
|
283
|
+
GGML_ASSERT(n_elements == steps);
|
284
|
+
|
285
|
+
uint64_t workspaceSize = 0;
|
286
|
+
aclOpExecutor* executor;
|
287
|
+
void* workspaceAddr = nullptr;
|
288
|
+
|
289
|
+
aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
|
290
|
+
aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
|
291
|
+
aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
|
292
|
+
|
293
|
+
ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst,
|
294
|
+
&workspaceSize, &executor));
|
295
|
+
if (workspaceSize > 0) {
|
296
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
297
|
+
workspaceAddr = workspace_allocator.get();
|
298
|
+
}
|
299
|
+
|
300
|
+
ACL_CHECK(
|
301
|
+
aclnnArange(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
302
|
+
|
303
|
+
ACL_CHECK(aclDestroyScalar(acl_start));
|
304
|
+
ACL_CHECK(aclDestroyScalar(acl_end));
|
305
|
+
ACL_CHECK(aclDestroyScalar(acl_step));
|
306
|
+
}
|
307
|
+
|
308
|
+
void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
309
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
310
|
+
|
311
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
312
|
+
|
313
|
+
int64_t n_elements = ggml_nelements(dst);
|
314
|
+
float start;
|
315
|
+
float stop;
|
316
|
+
float step;
|
317
|
+
memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
|
318
|
+
memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
|
319
|
+
memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
|
320
|
+
|
321
|
+
aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
|
322
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
323
|
+
}
|
324
|
+
|
325
|
+
void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
326
|
+
dst->src[1] = dst->src[0];
|
327
|
+
ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
|
328
|
+
}
|
329
|
+
|
330
|
+
void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
331
|
+
ggml_tensor* src = dst->src[0];
|
332
|
+
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
333
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
334
|
+
|
335
|
+
float min;
|
336
|
+
float max;
|
337
|
+
memcpy(&min, dst->op_params, sizeof(float));
|
338
|
+
memcpy(&max, (float*)dst->op_params + 1, sizeof(float));
|
339
|
+
|
340
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
341
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
342
|
+
|
343
|
+
aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
|
344
|
+
aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
|
345
|
+
|
346
|
+
uint64_t workspaceSize = 0;
|
347
|
+
aclOpExecutor* executor;
|
348
|
+
void* workspaceAddr = nullptr;
|
349
|
+
|
350
|
+
ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst,
|
351
|
+
&workspaceSize, &executor));
|
352
|
+
if (workspaceSize > 0) {
|
353
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
354
|
+
workspaceAddr = workspace_allocator.get();
|
355
|
+
}
|
356
|
+
|
357
|
+
ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
358
|
+
|
359
|
+
ACL_CHECK(aclDestroyScalar(acl_min));
|
360
|
+
ACL_CHECK(aclDestroyScalar(acl_max));
|
361
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
362
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
363
|
+
}
|
364
|
+
|
365
|
+
void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
366
|
+
ggml_tensor* src = dst->src[0];
|
367
|
+
|
368
|
+
// scale factor
|
369
|
+
float v;
|
370
|
+
memcpy(&v, dst->op_params, sizeof(float));
|
371
|
+
|
372
|
+
aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
|
373
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
374
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
375
|
+
|
376
|
+
uint64_t workspaceSize = 0;
|
377
|
+
aclOpExecutor* executor;
|
378
|
+
void* workspaceAddr = nullptr;
|
379
|
+
|
380
|
+
ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize,
|
381
|
+
&executor));
|
382
|
+
if (workspaceSize > 0) {
|
383
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
384
|
+
workspaceAddr = workspace_allocator.get();
|
385
|
+
}
|
386
|
+
|
387
|
+
ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
388
|
+
|
389
|
+
ACL_CHECK(aclDestroyScalar(scale));
|
390
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
391
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
392
|
+
}
|
393
|
+
|
394
|
+
void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
395
|
+
ggml_tensor* src = dst->src[0];
|
396
|
+
enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
|
397
|
+
|
398
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
399
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
400
|
+
ggml_cann_pool_alloc temp_buffer_allocator(
|
401
|
+
ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
|
402
|
+
void* buffer = temp_buffer_allocator.get();
|
403
|
+
aclTensor* tmp_tensor =
|
404
|
+
ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
|
405
|
+
dst->ne, dst->nb, GGML_MAX_DIMS);
|
406
|
+
|
407
|
+
uint64_t workspaceSize = 0;
|
408
|
+
aclOpExecutor* executor;
|
409
|
+
void* workspaceAddr = nullptr;
|
410
|
+
|
411
|
+
ACL_CHECK(aclnnArgsortGetWorkspaceSize(
|
412
|
+
acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor,
|
413
|
+
&workspaceSize, &executor));
|
414
|
+
if (workspaceSize > 0) {
|
415
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
416
|
+
workspaceAddr = workspace_allocator.get();
|
417
|
+
}
|
418
|
+
|
419
|
+
ACL_CHECK(
|
420
|
+
aclnnArgsort(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
421
|
+
|
422
|
+
workspaceSize = 0;
|
423
|
+
ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor,
|
424
|
+
ggml_cann_type_mapping(dst->type),
|
425
|
+
acl_dst, &workspaceSize, &executor));
|
426
|
+
if (workspaceSize > 0) {
|
427
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
428
|
+
workspaceAddr = workspace_allocator.get();
|
429
|
+
}
|
430
|
+
|
431
|
+
ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
432
|
+
|
433
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
434
|
+
ACL_CHECK(aclDestroyTensor(tmp_tensor));
|
435
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
436
|
+
}
|
437
|
+
|
438
|
+
void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
439
|
+
ggml_tensor* src = dst->src[0];
|
440
|
+
|
441
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
442
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
443
|
+
|
444
|
+
float eps;
|
445
|
+
memcpy(&eps, dst->op_params, sizeof(float));
|
446
|
+
|
447
|
+
uint64_t workspaceSize = 0;
|
448
|
+
aclOpExecutor* executor;
|
449
|
+
void* workspaceAddr = nullptr;
|
450
|
+
|
451
|
+
std::vector<int64_t> normData = {dst->ne[0]};
|
452
|
+
aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
|
453
|
+
ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr,
|
454
|
+
eps, acl_dst, nullptr, nullptr,
|
455
|
+
&workspaceSize, &executor));
|
456
|
+
|
457
|
+
if (workspaceSize > 0) {
|
458
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
459
|
+
workspaceAddr = workspace_allocator.get();
|
460
|
+
}
|
461
|
+
|
462
|
+
ACL_CHECK(
|
463
|
+
aclnnLayerNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
464
|
+
|
465
|
+
ACL_CHECK(aclDestroyIntArray(norm));
|
466
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
467
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
468
|
+
}
|
469
|
+
|
470
|
+
void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
471
|
+
ggml_tensor* src = dst->src[0];
|
472
|
+
|
473
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
474
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
475
|
+
|
476
|
+
int n_groups = dst->op_params[0];
|
477
|
+
|
478
|
+
float eps;
|
479
|
+
memcpy(&eps, dst->op_params + 1, sizeof(float));
|
480
|
+
|
481
|
+
uint64_t workspaceSize = 0;
|
482
|
+
aclOpExecutor* executor;
|
483
|
+
void* workspaceAddr = nullptr;
|
484
|
+
|
485
|
+
int64_t N = src->ne[3];
|
486
|
+
int64_t C = src->ne[2];
|
487
|
+
int64_t HxW = src->ne[1] * src->ne[0];
|
488
|
+
|
489
|
+
size_t type_size = ggml_type_size(src->type);
|
490
|
+
int64_t ne[] = {n_groups, N};
|
491
|
+
size_t nb[] = {type_size, type_size * n_groups};
|
492
|
+
size_t n_bytes = N * n_groups;
|
493
|
+
|
494
|
+
ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
|
495
|
+
void* buffer = temp_buffer_allocator.get();
|
496
|
+
aclTensor* acl_mean_out = ggml_cann_create_tensor(
|
497
|
+
buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
|
498
|
+
aclTensor* acl_rstd_out = ggml_cann_create_tensor(
|
499
|
+
(char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
|
500
|
+
|
501
|
+
ACL_CHECK(aclnnGroupNormGetWorkspaceSize(
|
502
|
+
acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst,
|
503
|
+
acl_mean_out, acl_rstd_out, &workspaceSize, &executor));
|
504
|
+
|
505
|
+
if (workspaceSize > 0) {
|
506
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
507
|
+
workspaceAddr = workspace_allocator.get();
|
508
|
+
}
|
509
|
+
|
510
|
+
ACL_CHECK(
|
511
|
+
aclnnGroupNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
512
|
+
|
513
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
514
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
515
|
+
ACL_CHECK(aclDestroyTensor(acl_mean_out));
|
516
|
+
ACL_CHECK(aclDestroyTensor(acl_rstd_out));
|
517
|
+
}
|
518
|
+
|
519
|
+
void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
520
|
+
ggml_tensor* src0 = dst->src[0];
|
521
|
+
ggml_tensor* src1 = dst->src[1];
|
522
|
+
|
523
|
+
size_t nb1 = ((int32_t*)dst->op_params)[0];
|
524
|
+
size_t nb2 = ((int32_t*)dst->op_params)[1];
|
525
|
+
size_t nb3 = ((int32_t*)dst->op_params)[2];
|
526
|
+
size_t offset = ((int32_t*)dst->op_params)[3];
|
527
|
+
bool inplace = (bool)((int32_t*)dst->op_params)[4];
|
528
|
+
|
529
|
+
size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3};
|
530
|
+
|
531
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(
|
532
|
+
dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
|
533
|
+
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
534
|
+
|
535
|
+
aclScalar* alpha = nullptr;
|
536
|
+
float alphaValue = 1.0f;
|
537
|
+
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
|
538
|
+
|
539
|
+
uint64_t workspaceSize = 0;
|
540
|
+
aclOpExecutor* executor;
|
541
|
+
void* workspaceAddr = nullptr;
|
542
|
+
|
543
|
+
if (!inplace) {
|
544
|
+
size_t cpy_size = ggml_nbytes(dst);
|
545
|
+
ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size,
|
546
|
+
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
547
|
+
aclTensor* acl_src0 = ggml_cann_create_tensor(
|
548
|
+
src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
|
549
|
+
ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
|
550
|
+
&workspaceSize, &executor));
|
551
|
+
if (workspaceSize > 0) {
|
552
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
553
|
+
workspaceAddr = workspace_allocator.get();
|
554
|
+
}
|
555
|
+
ACL_CHECK(
|
556
|
+
aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
557
|
+
ACL_CHECK(aclDestroyTensor(acl_src0));
|
558
|
+
} else {
|
559
|
+
ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha,
|
560
|
+
&workspaceSize, &executor));
|
561
|
+
if (workspaceSize > 0) {
|
562
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
563
|
+
workspaceAddr = workspace_allocator.get();
|
564
|
+
}
|
565
|
+
ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor,
|
566
|
+
ctx.stream()));
|
567
|
+
}
|
568
|
+
|
569
|
+
ACL_CHECK(aclDestroyTensor(acl_src1));
|
570
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
571
|
+
}
|
572
|
+
|
573
|
+
void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
574
|
+
ggml_tensor* src = dst->src[0];
|
575
|
+
|
576
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
577
|
+
|
578
|
+
GGML_ASSERT(dst->ne[0] == 1);
|
579
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
580
|
+
|
581
|
+
int64_t reduce_dims_host[] = {3};
|
582
|
+
aclIntArray* reduce_dims = aclCreateIntArray(reduce_dims_host, 1);
|
583
|
+
|
584
|
+
uint64_t workspaceSize = 0;
|
585
|
+
aclOpExecutor* executor;
|
586
|
+
void* workspaceAddr = nullptr;
|
587
|
+
|
588
|
+
ACL_CHECK(aclnnReduceSumGetWorkspaceSize(
|
589
|
+
acl_src, reduce_dims, true, ggml_cann_type_mapping(src->type), acl_dst,
|
590
|
+
&workspaceSize, &executor));
|
591
|
+
if (workspaceSize > 0) {
|
592
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
593
|
+
workspaceAddr = workspace_allocator.get();
|
594
|
+
}
|
595
|
+
|
596
|
+
ACL_CHECK(
|
597
|
+
aclnnReduceSum(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
598
|
+
|
599
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
600
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
601
|
+
}
|
602
|
+
|
603
|
+
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
|
604
|
+
ggml_tensor* dst) {
|
605
|
+
ggml_tensor* src = dst->src[0];
|
606
|
+
aclTensor* acl_src =
|
607
|
+
ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
608
|
+
aclTensor* acl_dst =
|
609
|
+
ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
610
|
+
|
611
|
+
std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
|
612
|
+
auto output_size_array = aclCreateIntArray(output_size.data(), 2);
|
613
|
+
|
614
|
+
uint64_t workspaceSize = 0;
|
615
|
+
aclOpExecutor* executor;
|
616
|
+
void* workspaceAddr = nullptr;
|
617
|
+
|
618
|
+
ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize(
|
619
|
+
acl_src, output_size_array, acl_dst, &workspaceSize, &executor));
|
620
|
+
if (workspaceSize > 0) {
|
621
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
622
|
+
workspaceAddr = workspace_allocator.get();
|
623
|
+
}
|
624
|
+
|
625
|
+
ACL_CHECK(aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor,
|
626
|
+
ctx.stream()));
|
627
|
+
|
628
|
+
ACL_CHECK(aclDestroyIntArray(output_size_array));
|
629
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
630
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
631
|
+
}
|
632
|
+
|
633
|
+
/**
|
634
|
+
* @brief Pads a tensor with a specified value along each dimension.
|
635
|
+
*
|
636
|
+
* This function performs padding of the source tensor `acl_src` and stores the
|
637
|
+
* result in the destination tensor `acl_dst`. The padding values for each
|
638
|
+
* dimension are specified in the `paddings` array.
|
639
|
+
*
|
640
|
+
* @param ctx The context for the CANN backend operations.
|
641
|
+
* @param acl_src The source tensor to be padded.
|
642
|
+
* @param acl_dst The destination tensor where the padded result will be stored.
|
643
|
+
* @param paddings An array specifying the padding values for each dimension.
|
644
|
+
* The size of the array should be twice the number of dimensions of the tensor.
|
645
|
+
* @param value The value to be used for padding. The default value is 0.0.
|
646
|
+
*/
|
647
|
+
static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
648
|
+
aclTensor* acl_dst, int64_t* paddings,
|
649
|
+
float value = 0.0f) {
|
650
|
+
aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
|
651
|
+
aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
|
652
|
+
|
653
|
+
uint64_t workspaceSize = 0;
|
654
|
+
aclOpExecutor* executor;
|
655
|
+
void* workspaceAddr = nullptr;
|
656
|
+
|
657
|
+
ACL_CHECK(aclnnConstantPadNdGetWorkspaceSize(
|
658
|
+
acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor));
|
659
|
+
|
660
|
+
if (workspaceSize > 0) {
|
661
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
662
|
+
workspaceAddr = workspace_allocator.get();
|
663
|
+
}
|
664
|
+
|
665
|
+
ACL_CHECK(aclnnConstantPadNd(workspaceAddr, workspaceSize, executor,
|
666
|
+
ctx.stream()));
|
667
|
+
|
668
|
+
ACL_CHECK(aclDestroyIntArray(acl_pad));
|
669
|
+
ACL_CHECK(aclDestroyScalar(acl_value));
|
670
|
+
}
|
671
|
+
|
672
|
+
void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
673
|
+
ggml_tensor* src = dst->src[0];
|
674
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
675
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
676
|
+
|
677
|
+
// padding: value in the array means how much distance will be padding.
|
678
|
+
// the position of elements in the array means which dirction to padding,
|
679
|
+
// each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind,
|
680
|
+
// dim2.front, dim2.behind, dim3.front, dim3.behind]
|
681
|
+
int64_t paddings[] = {
|
682
|
+
0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
|
683
|
+
0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
|
684
|
+
aclnn_pad(ctx, acl_src, acl_dst, paddings);
|
685
|
+
|
686
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
687
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
688
|
+
}
|
689
|
+
|
690
|
+
/**
|
691
|
+
* @brief Performs 2D average pooling on the input tensor and stores the result
|
692
|
+
* in the destination tensor.
|
693
|
+
*
|
694
|
+
* This function performs average pooling on the source tensor and stores the
|
695
|
+
* result in the destination tensor. The pooling parameters (kernel size,
|
696
|
+
* strides, padding) are specified in the `op_params` of the destination tensor.
|
697
|
+
*
|
698
|
+
* @param ctx The context for the CANN backend operations.
|
699
|
+
* @param dst The destination tensor where the result will be stored. The source
|
700
|
+
* tensor is referenced by `dst->src[0]`.
|
701
|
+
*/
|
702
|
+
static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
|
703
|
+
ggml_tensor* dst) {
|
704
|
+
ggml_tensor* src = dst->src[0];
|
705
|
+
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
706
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
707
|
+
|
708
|
+
aclTensor* acl_src =
|
709
|
+
ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
710
|
+
aclTensor* acl_dst =
|
711
|
+
ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
712
|
+
|
713
|
+
const int32_t* opts = (const int32_t*)dst->op_params;
|
714
|
+
const int k0 = opts[1];
|
715
|
+
const int k1 = opts[2];
|
716
|
+
const int s0 = opts[3];
|
717
|
+
const int s1 = opts[4];
|
718
|
+
const int p0 = opts[5];
|
719
|
+
const int p1 = opts[6];
|
720
|
+
|
721
|
+
std::vector<int64_t> kernel_dims = {k1, k0};
|
722
|
+
std::vector<int64_t> stride_dims = {s1, s0};
|
723
|
+
std::vector<int64_t> padding_avg_dims = {p1, p0}; // (padH, padW)
|
724
|
+
|
725
|
+
auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
|
726
|
+
auto* strides = aclCreateIntArray(stride_dims.data(), 2);
|
727
|
+
auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2);
|
728
|
+
|
729
|
+
bool ceil_mode = false;
|
730
|
+
bool count_include_pad = true;
|
731
|
+
int64_t divisor_override = 0;
|
732
|
+
int8_t cube_math_type = 0;
|
733
|
+
|
734
|
+
uint64_t workspaceSize = 0;
|
735
|
+
aclOpExecutor* executor;
|
736
|
+
void* workspaceAddr = nullptr;
|
737
|
+
|
738
|
+
ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize(
|
739
|
+
acl_src, kernel_size, strides, paddings_avg, ceil_mode,
|
740
|
+
count_include_pad, divisor_override, cube_math_type, acl_dst,
|
741
|
+
&workspaceSize, &executor));
|
742
|
+
|
743
|
+
if (workspaceSize > 0) {
|
744
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
745
|
+
workspaceAddr = workspace_allocator.get();
|
746
|
+
}
|
747
|
+
ACL_CHECK(
|
748
|
+
aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
749
|
+
|
750
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
751
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
752
|
+
ACL_CHECK(aclDestroyIntArray(kernel_size));
|
753
|
+
ACL_CHECK(aclDestroyIntArray(strides));
|
754
|
+
ACL_CHECK(aclDestroyIntArray(paddings_avg));
|
755
|
+
}
|
756
|
+
|
757
|
+
/**
|
758
|
+
* @brief Performs 2D max pooling on the input tensor and stores the result in
|
759
|
+
* the destination tensor.
|
760
|
+
*
|
761
|
+
* This function performs max pooling on the source tensor and stores the result
|
762
|
+
* in the destination tensor. The pooling parameters (kernel size, strides,
|
763
|
+
* padding) are specified in the `op_params` of the destination tensor.
|
764
|
+
*
|
765
|
+
* @param ctx The context for the CANN backend operations.
|
766
|
+
* @param dst The destination tensor where the result will be stored. The source
|
767
|
+
* tensor is referenced by `dst->src[0]`.
|
768
|
+
*/
|
769
|
+
static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
|
770
|
+
ggml_tensor* dst) {
|
771
|
+
ggml_tensor* src = dst->src[0];
|
772
|
+
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
773
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
774
|
+
|
775
|
+
aclTensor* acl_src =
|
776
|
+
ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
777
|
+
aclTensor* acl_dst =
|
778
|
+
ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
779
|
+
|
780
|
+
const int32_t* opts = (const int32_t*)dst->op_params;
|
781
|
+
const int k0 = opts[1];
|
782
|
+
const int k1 = opts[2];
|
783
|
+
const int s0 = opts[3];
|
784
|
+
const int s1 = opts[4];
|
785
|
+
const int p0 = opts[5];
|
786
|
+
const int p1 = opts[6];
|
787
|
+
|
788
|
+
int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2],
|
789
|
+
src->ne[3]};
|
790
|
+
size_t temp_nb[GGML_MAX_DIMS];
|
791
|
+
|
792
|
+
temp_nb[0] = ggml_element_size(src);
|
793
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
794
|
+
temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
|
795
|
+
}
|
796
|
+
|
797
|
+
ggml_cann_pool_alloc temp_buffer_allocator(
|
798
|
+
ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
|
799
|
+
void* buffer = temp_buffer_allocator.get();
|
800
|
+
aclTensor* tmp_tensor = ggml_cann_create_tensor(
|
801
|
+
buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
|
802
|
+
GGML_MAX_DIMS, ACL_FORMAT_NCHW);
|
803
|
+
|
804
|
+
// pad: see padding in ggml_cann_pad()
|
805
|
+
int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0};
|
806
|
+
float value = -FLT_MAX;
|
807
|
+
aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value);
|
808
|
+
|
809
|
+
// max_pool
|
810
|
+
std::vector<int64_t> kernel_dims = {k1, k0};
|
811
|
+
std::vector<int64_t> stride_dims = {s1, s0};
|
812
|
+
// padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
|
813
|
+
std::vector<int64_t> padding_max_dims = {0, 0, 0, 0};
|
814
|
+
std::vector<int64_t> dilation_size = {1, 1};
|
815
|
+
auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
|
816
|
+
auto* strides = aclCreateIntArray(stride_dims.data(), 2);
|
817
|
+
auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4);
|
818
|
+
auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
|
819
|
+
|
820
|
+
bool ceil_mode = false;
|
821
|
+
int64_t auto_pads = 0;
|
822
|
+
|
823
|
+
uint64_t workspaceSize = 0;
|
824
|
+
aclOpExecutor* executor;
|
825
|
+
void* workspaceAddr = nullptr;
|
826
|
+
|
827
|
+
ACL_CHECK(aclnnMaxPoolGetWorkspaceSize(
|
828
|
+
tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations,
|
829
|
+
ceil_mode, acl_dst, &workspaceSize, &executor));
|
830
|
+
if (workspaceSize > 0) {
|
831
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
832
|
+
workspaceAddr = workspace_allocator.get();
|
833
|
+
}
|
834
|
+
|
835
|
+
ACL_CHECK(
|
836
|
+
aclnnMaxPool(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
837
|
+
|
838
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
839
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
840
|
+
ACL_CHECK(aclDestroyTensor(tmp_tensor));
|
841
|
+
ACL_CHECK(aclDestroyIntArray(kernel_size));
|
842
|
+
ACL_CHECK(aclDestroyIntArray(strides));
|
843
|
+
ACL_CHECK(aclDestroyIntArray(paddings_max));
|
844
|
+
ACL_CHECK(aclDestroyIntArray(dilations));
|
845
|
+
}
|
846
|
+
|
847
|
+
void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
848
|
+
const int32_t* opts = (const int32_t*)dst->op_params;
|
849
|
+
enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
|
850
|
+
switch (op) {
|
851
|
+
case GGML_OP_POOL_AVG:
|
852
|
+
ggml_cann_avg_pool2d(ctx, dst);
|
853
|
+
break;
|
854
|
+
case GGML_OP_POOL_MAX:
|
855
|
+
ggml_cann_max_pool2d(ctx, dst);
|
856
|
+
break;
|
857
|
+
case GGML_OP_POOL_COUNT:
|
858
|
+
GGML_ABORT("fatal error");
|
859
|
+
break;
|
860
|
+
}
|
861
|
+
}
|
862
|
+
|
863
|
+
/**
|
864
|
+
* @brief Copies data from the source tensor to the destination tensor.
|
865
|
+
*
|
866
|
+
* This function copies data from the source tensor `acl_src` to the destination
|
867
|
+
* tensor `acl_dst`.
|
868
|
+
*
|
869
|
+
* @param ctx The context for the CANN backend operations.
|
870
|
+
* @param acl_src The source tensor from which data will be copied.
|
871
|
+
* @param acl_dst The destination tensor where the data will be copied to.
|
872
|
+
*/
|
873
|
+
static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
874
|
+
aclTensor* acl_dst) {
|
875
|
+
uint64_t workspaceSize = 0;
|
876
|
+
aclOpExecutor* executor;
|
877
|
+
void* workspaceAddr = nullptr;
|
878
|
+
|
879
|
+
ACL_CHECK(aclnnInplaceCopyGetWorkspaceSize(acl_dst, acl_src, &workspaceSize,
|
880
|
+
&executor));
|
881
|
+
|
882
|
+
if (workspaceSize > 0) {
|
883
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
884
|
+
workspaceAddr = workspace_allocator.get();
|
885
|
+
}
|
886
|
+
|
887
|
+
ACL_CHECK(
|
888
|
+
aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
889
|
+
}
|
890
|
+
|
891
|
+
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
892
|
+
ggml_tensor* src = dst->src[0];
|
893
|
+
|
894
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
895
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
896
|
+
|
897
|
+
ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
898
|
+
ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
899
|
+
src->extra = src_extra_allocator.get();
|
900
|
+
dst->extra = dst_extra_allocator.get();
|
901
|
+
ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
|
902
|
+
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
903
|
+
ctx.stream()));
|
904
|
+
ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
|
905
|
+
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
906
|
+
ctx.stream()));
|
907
|
+
|
908
|
+
if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
|
909
|
+
ggml_are_same_shape(src, dst)) {
|
910
|
+
cann_copy(ctx, acl_src, acl_dst);
|
911
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
912
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
913
|
+
return;
|
914
|
+
}
|
915
|
+
// TODO: simplify
|
916
|
+
if (src->type == GGML_TYPE_F16) {
|
917
|
+
if (dst->type == GGML_TYPE_Q8_0) {
|
918
|
+
aclrtlaunch_ascendc_quantize_f16_q8_0(
|
919
|
+
24, ctx.stream(), src->data, dst->data,
|
920
|
+
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
921
|
+
((ggml_tensor*)dst->extra)->ne);
|
922
|
+
return;
|
923
|
+
}
|
924
|
+
if (dst->type == GGML_TYPE_Q4_0) {
|
925
|
+
aclrtlaunch_ascendc_quantize_f16_to_q4_0(
|
926
|
+
24, ctx.stream(), src->data, dst->data,
|
927
|
+
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
928
|
+
((ggml_tensor*)dst->extra)->ne);
|
929
|
+
return;
|
930
|
+
}
|
931
|
+
if (dst->type == GGML_TYPE_F16) {
|
932
|
+
if (ggml_are_same_shape(src, dst)) {
|
933
|
+
cann_copy(ctx, acl_src, acl_dst);
|
934
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
935
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
936
|
+
return;
|
937
|
+
}
|
938
|
+
if (ggml_is_contiguous(dst)) {
|
939
|
+
const size_t src_type_size = ggml_type_size(src->type);
|
940
|
+
if (src->nb[0] == src_type_size) {
|
941
|
+
// src0 is contigous on first dimension, copy by rows
|
942
|
+
int64_t rows_num = ggml_nrows(src);
|
943
|
+
|
944
|
+
aclrtlaunch_ascendc_dup_by_rows_fp16(
|
945
|
+
rows_num, ctx.stream(), src->data, dst->data,
|
946
|
+
((ggml_tensor*)src->extra)->ne,
|
947
|
+
((ggml_tensor*)src->extra)->nb,
|
948
|
+
((ggml_tensor*)dst->extra)->ne,
|
949
|
+
((ggml_tensor*)dst->extra)->nb);
|
950
|
+
return;
|
951
|
+
}
|
952
|
+
GGML_ABORT("fatal error");
|
953
|
+
}
|
954
|
+
GGML_ABORT("fatal error");
|
955
|
+
}
|
956
|
+
if (dst->type == GGML_TYPE_F32) {
|
957
|
+
if (ggml_are_same_shape(src, dst)) {
|
958
|
+
cann_copy(ctx, acl_src, acl_dst);
|
959
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
960
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
961
|
+
return;
|
962
|
+
}
|
963
|
+
if (ggml_is_contiguous(dst)) {
|
964
|
+
const size_t src_type_size = ggml_type_size(src->type);
|
965
|
+
if (src->nb[0] == src_type_size) {
|
966
|
+
// src0 is contigous on first dimension, copy by rows
|
967
|
+
int64_t rows_num = ggml_nrows(src);
|
968
|
+
aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
|
969
|
+
rows_num, ctx.stream(), src->data, dst->data,
|
970
|
+
((ggml_tensor*)src->extra)->ne,
|
971
|
+
((ggml_tensor*)src->extra)->nb,
|
972
|
+
((ggml_tensor*)dst->extra)->ne,
|
973
|
+
((ggml_tensor*)dst->extra)->nb);
|
974
|
+
return;
|
975
|
+
}
|
976
|
+
GGML_ABORT("fatal error");
|
977
|
+
}
|
978
|
+
GGML_ABORT("fatal error");
|
979
|
+
}
|
980
|
+
// TODO
|
981
|
+
GGML_ABORT("fatal error");
|
982
|
+
} else if (src->type == GGML_TYPE_F32) {
|
983
|
+
// TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
|
984
|
+
// && nb0 == type_size)
|
985
|
+
if (dst->type == GGML_TYPE_Q8_0) {
|
986
|
+
aclrtlaunch_ascendc_quantize_f32_q8_0(
|
987
|
+
24, ctx.stream(), src->data, dst->data,
|
988
|
+
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
989
|
+
((ggml_tensor*)dst->extra)->ne);
|
990
|
+
return;
|
991
|
+
}
|
992
|
+
if (dst->type == GGML_TYPE_Q4_0) {
|
993
|
+
aclrtlaunch_ascendc_quantize_f32_to_q4_0(
|
994
|
+
24, ctx.stream(), src->data, dst->data,
|
995
|
+
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
996
|
+
((ggml_tensor*)dst->extra)->ne);
|
997
|
+
return;
|
998
|
+
}
|
999
|
+
if (dst->type == GGML_TYPE_F32) {
|
1000
|
+
if (ggml_are_same_shape(src, dst)) {
|
1001
|
+
cann_copy(ctx, acl_src, acl_dst);
|
1002
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
1003
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
1004
|
+
return;
|
1005
|
+
}
|
1006
|
+
if (ggml_is_contiguous(dst)) {
|
1007
|
+
const size_t src_type_size = ggml_type_size(src->type);
|
1008
|
+
if (src->nb[0] == src_type_size) {
|
1009
|
+
// src0 is contigous on first dimension, copy by rows
|
1010
|
+
int64_t rows_num = ggml_nrows(src);
|
1011
|
+
aclrtlaunch_ascendc_dup_by_rows_fp32(
|
1012
|
+
rows_num, ctx.stream(), src->data, dst->data,
|
1013
|
+
((ggml_tensor*)src->extra)->ne,
|
1014
|
+
((ggml_tensor*)src->extra)->nb,
|
1015
|
+
((ggml_tensor*)dst->extra)->ne,
|
1016
|
+
((ggml_tensor*)dst->extra)->nb);
|
1017
|
+
return;
|
1018
|
+
}
|
1019
|
+
GGML_ABORT("fatal error");
|
1020
|
+
} else {
|
1021
|
+
// TODO: dst not contiguous
|
1022
|
+
GGML_ABORT("fatal error");
|
1023
|
+
}
|
1024
|
+
}
|
1025
|
+
if (dst->type == GGML_TYPE_F16) {
|
1026
|
+
if (ggml_are_same_shape(src, dst)) {
|
1027
|
+
cann_copy(ctx, acl_src, acl_dst);
|
1028
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
1029
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
1030
|
+
return;
|
1031
|
+
}
|
1032
|
+
if (ggml_is_contiguous(dst)) {
|
1033
|
+
const size_t src_type_size = ggml_type_size(src->type);
|
1034
|
+
if (src->nb[0] == src_type_size) {
|
1035
|
+
// src0 is contigous on first dimension, copy by rows
|
1036
|
+
int64_t rows_num = ggml_nrows(src);
|
1037
|
+
aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
|
1038
|
+
rows_num, ctx.stream(), src->data, dst->data,
|
1039
|
+
((ggml_tensor*)src->extra)->ne,
|
1040
|
+
((ggml_tensor*)src->extra)->nb,
|
1041
|
+
((ggml_tensor*)dst->extra)->ne,
|
1042
|
+
((ggml_tensor*)dst->extra)->nb);
|
1043
|
+
return;
|
1044
|
+
}
|
1045
|
+
GGML_ABORT("fatal error");
|
1046
|
+
}
|
1047
|
+
}
|
1048
|
+
// TODO
|
1049
|
+
GGML_ABORT("fatal error");
|
1050
|
+
} else {
|
1051
|
+
if (ggml_are_same_shape(src, dst)) {
|
1052
|
+
cann_copy(ctx, acl_src, acl_dst);
|
1053
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
1054
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
1055
|
+
return;
|
1056
|
+
}
|
1057
|
+
GGML_ABORT("fatal error");
|
1058
|
+
}
|
1059
|
+
}
|
1060
|
+
|
1061
|
+
#ifdef __cplusplus
|
1062
|
+
extern "C" {
|
1063
|
+
#endif
|
1064
|
+
aclnnStatus aclnnRmsNormGetWorkspaceSize(const aclTensor* x,
|
1065
|
+
const aclTensor* gamma, double epsilon,
|
1066
|
+
const aclTensor* yOut,
|
1067
|
+
const aclTensor* rstdOout,
|
1068
|
+
uint64_t* workspaceSize,
|
1069
|
+
aclOpExecutor** executor);
|
1070
|
+
aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize,
|
1071
|
+
aclOpExecutor* executor, aclrtStream stream);
|
1072
|
+
#ifdef __cplusplus
|
1073
|
+
}
|
1074
|
+
#endif
|
1075
|
+
|
1076
|
+
/**
|
1077
|
+
* @brief Creates an ACL tensor initialized with zeros using a provided buffer.
|
1078
|
+
*
|
1079
|
+
* This function initializes a tensor with zeros using the specified buffer and
|
1080
|
+
* tensor parameters.
|
1081
|
+
*
|
1082
|
+
* @param ctx The context for the CANN backend operations.
|
1083
|
+
* @param buffer The buffer to be used for the tensor data.
|
1084
|
+
* @param n_bytes The size of the buffer in bytes.
|
1085
|
+
* @param ne An array specifying the extents (sizes) of each dimension of the
|
1086
|
+
* tensor.
|
1087
|
+
* @param dims The number of dimensions of the tensor.
|
1088
|
+
* @param type The data type of the tensor.
|
1089
|
+
* @param type_size The size of each element in the tensor data type.
|
1090
|
+
* @return An ACL tensor initialized with zeros.
|
1091
|
+
*/
|
1092
|
+
static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
|
1093
|
+
size_t n_bytes, int64_t* ne, int64_t dims,
|
1094
|
+
aclDataType type, size_t type_size) {
|
1095
|
+
size_t nb[GGML_MAX_DIMS];
|
1096
|
+
nb[0] = type_size;
|
1097
|
+
for (int i = 1; i < dims; i++) {
|
1098
|
+
nb[i] = nb[i - 1] * ne[i - 1];
|
1099
|
+
}
|
1100
|
+
|
1101
|
+
ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream()));
|
1102
|
+
aclTensor* zero =
|
1103
|
+
ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
|
1104
|
+
return zero;
|
1105
|
+
}
|
1106
|
+
|
1107
|
+
/**
|
1108
|
+
* @brief Creates an ACL tensor initialized with value using a provided buffer.
|
1109
|
+
*
|
1110
|
+
* This function initializes a tensor with value using the specified buffer and
|
1111
|
+
* tensor parameters.
|
1112
|
+
*
|
1113
|
+
* @param ctx The context for the CANN backend operations.
|
1114
|
+
* @param buffer The buffer to be used for the tensor data.
|
1115
|
+
* @param n_bytes The size of the buffer in bytes.
|
1116
|
+
* @param ne An array specifying the extents (sizes) of each dimension of the
|
1117
|
+
* tensor.
|
1118
|
+
* @param dims The number of dimensions of the tensor.
|
1119
|
+
* @param type The data type of the tensor.
|
1120
|
+
* @param type_size The size of each element in the tensor data type.
|
1121
|
+
* @param value The value to be used for initializing the tensor (default
|
1122
|
+
* is 1.0).
|
1123
|
+
* @return An ACL tensor initialized with value.
|
1124
|
+
*/
|
1125
|
+
static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
|
1126
|
+
size_t n_bytes, int64_t* ne, int64_t dims,
|
1127
|
+
aclDataType type, size_t type_size,
|
1128
|
+
float value = 1.0f) {
|
1129
|
+
aclTensor* acl_tensor =
|
1130
|
+
aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
|
1131
|
+
float alpha_host = 1.0f;
|
1132
|
+
aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
|
1133
|
+
aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
|
1134
|
+
|
1135
|
+
uint64_t workspaceSize = 0;
|
1136
|
+
aclOpExecutor* executor;
|
1137
|
+
void* workspaceAddr = nullptr;
|
1138
|
+
|
1139
|
+
ACL_CHECK(aclnnInplaceAddsGetWorkspaceSize(acl_tensor, other, alpha,
|
1140
|
+
&workspaceSize, &executor));
|
1141
|
+
|
1142
|
+
if (workspaceSize > 0) {
|
1143
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1144
|
+
workspaceAddr = workspace_allocator.get();
|
1145
|
+
}
|
1146
|
+
ACL_CHECK(
|
1147
|
+
aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
1148
|
+
|
1149
|
+
return acl_tensor;
|
1150
|
+
}

void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    GGML_ASSERT(eps > 0.0f);

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);

    aclTensor* acl_gamma = aclnn_values(
        ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
        ggml_cann_type_mapping(src->type), ggml_element_size(src));

    size_t zero_tensor_n_bytes =
        src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src);
    ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes);
    aclTensor* acl_rstd =
        aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
                   src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
                   ggml_element_size(src));

    ACL_CHECK(aclnnRmsNormGetWorkspaceSize(
        acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor));

    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(
        aclnnRmsNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));

    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
    ACL_CHECK(aclDestroyTensor(acl_gamma));
    ACL_CHECK(aclDestroyTensor(acl_rstd));
}
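// In effect, the call above computes RMS normalization over the innermost
// dimension, with gamma fixed to the all-ones tensor built by aclnn_values:
//
//     rstd  = 1 / sqrt(mean(src_i^2) + eps)
//     dst_i = src_i * rstd * gamma_i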

// TODO: performance is low.
void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                         float value) {
    ggml_tensor* src = dst->src[0];

    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

    const int n_past = ((int32_t*)dst->op_params)[0];

    size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *
                                src->ne[3] * ggml_element_size(src);
    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);

    aclTensor* mask_tensor =
        aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
                     src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
                     ggml_element_size(src), value);

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1,
                                               &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(
        aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream()));

    ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst,
                                        &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream()));

    aclScalar* alpha = nullptr;
    float alphaValue = 1.0f;
    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);

    ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha,
                                              &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }
    ACL_CHECK(
        aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));

    ACL_CHECK(aclDestroyScalar(alpha));
    ACL_CHECK(aclDestroyTensor(mask_tensor));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
}
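// Effect sketch (ggml diag-mask semantics): entries more than n_past columns
// past the diagonal receive `value`, the rest copy src. For a 4x4 slice with
// n_past = 1:
//
//     [ s s v v ]
//     [ s s s v ]      s = element copied from src (aclnnTril)
//     [ s s s s ]      v = `value` contributed by the upper-triangular mask
//     [ s s s s ]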

/**
 * @brief Casts the data type of a source tensor to a destination tensor.
 *
 * This function casts the data type of the source tensor `acl_src` to the
 * specified data type `cast_data_type` and stores the result in the destination
 * tensor `acl_dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor whose data type will be cast.
 * @param acl_dst The destination tensor where the cast result will be stored.
 * @param cast_data_type The target data type to which the source tensor will be
 * cast.
 */
static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                       aclTensor* acl_dst, aclDataType cast_data_type) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst,
                                        &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
}

/**
 * @brief Permutes the dimensions of a tensor according to a specified order.
 *
 * This function permutes the dimensions of the source tensor `acl_src`
 * according to the order specified in the `new_dim` array and stores the result
 * in the destination tensor `acl_dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor whose dimensions will be permuted.
 * @param acl_dst The destination tensor where the permuted result will be
 * stored.
 * @param new_dim An array specifying the new order of dimensions for the
 * tensor.
 * @param dims The number of dimensions in the tensor.
 */
static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                          aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
    aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst,
                                           &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(
        aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));

    ACL_CHECK(aclDestroyIntArray(acl_dims));
}
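// Usage sketch (illustrative; `src3d` and `dst3d` are assumed 3-D aclTensors):
// the order {0, 2, 1} swaps the last two axes, which is how the im2col helpers
// below turn [N, IC*KH*KW, OW*OH] into [N, OW*OH, IC*KH*KW].
//
//     int64_t order[] = {0, 2, 1};
//     aclnn_permute(ctx, src3d, dst3d, order, 3);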

#ifdef __cplusplus
extern "C" {
#endif
aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self,
                                        const aclIntArray* kernelSize,
                                        const aclIntArray* dilation,
                                        const aclIntArray* padding,
                                        const aclIntArray* stride,
                                        aclTensor* out, uint64_t* workspaceSize,
                                        aclOpExecutor** executor);
aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
                        aclOpExecutor* executor, aclrtStream stream);
#ifdef __cplusplus
}
#endif

static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
                                             ggml_tensor* dst,
                                             ggml_tensor* src1,
                                             aclTensor* tmp_cast_tensor,
                                             aclTensor* tmp_im2col_tensor) {
    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
    aclTensor* acl_dst =
        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);

    int64_t permute_dim[] = {0, 2, 1};
    if (src1->type != dst->type) {
        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
    } else {
        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
    }

    // release
    ACL_CHECK(aclDestroyTensor(acl_dst));
}
|
1362
|
+
|
1363
|
+
static void ggml_cann_im2col_1d_post_process(
|
1364
|
+
ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
|
1365
|
+
aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
|
1366
|
+
const std::vector<int64_t>& im2col_op_params) {
|
1367
|
+
// get params
|
1368
|
+
const int64_t KH = im2col_op_params[0];
|
1369
|
+
const int64_t KW = im2col_op_params[1];
|
1370
|
+
const int64_t IW = im2col_op_params[2];
|
1371
|
+
const int64_t IC = im2col_op_params[3];
|
1372
|
+
const int64_t N = im2col_op_params[4];
|
1373
|
+
const int64_t OH = im2col_op_params[5];
|
1374
|
+
const int64_t OW = im2col_op_params[6];
|
1375
|
+
const int64_t s0 = im2col_op_params[7];
|
1376
|
+
const int64_t p0 = im2col_op_params[8];
|
1377
|
+
const int64_t d0 = im2col_op_params[9];
|
1378
|
+
const int64_t n_bytes_factor = im2col_op_params[10];
|
1379
|
+
|
1380
|
+
// Permute: [N, IC * KH * KW, OW * OH] ->
|
1381
|
+
// [N, OW * OH * n_bytes_factor, IC * KH * KW]
|
1382
|
+
aclTensor* tmp_permute_tensor = nullptr;
|
1383
|
+
ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
|
1384
|
+
tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
|
1385
|
+
void* tmp_permute_buffer = tmp_permute_allocator.get();
|
1386
|
+
|
1387
|
+
int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
|
1388
|
+
size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
|
1389
|
+
tmp_permute_nb[0] = ggml_type_size(dst->type);
|
1390
|
+
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
|
1391
|
+
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
|
1392
|
+
}
|
1393
|
+
|
1394
|
+
tmp_permute_tensor = ggml_cann_create_tensor(
|
1395
|
+
tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
|
1396
|
+
ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
|
1397
|
+
GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
|
1398
|
+
|
1399
|
+
int64_t permute_dim[] = {0, 2, 1};
|
1400
|
+
if (src1->type != dst->type) {
|
1401
|
+
aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
|
1402
|
+
} else {
|
1403
|
+
aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
|
1404
|
+
3);
|
1405
|
+
}
|
1406
|
+
|
1407
|
+
// number of times the kernel moves in W dimension
|
1408
|
+
const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
|
1409
|
+
size_t offset;
|
1410
|
+
void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
|
1411
|
+
|
1412
|
+
// memory copy with offset to restore 1D im2col from 2d
|
1413
|
+
if (IC > 1) {
|
1414
|
+
offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
|
1415
|
+
size_t size_cpy = KH * KW * ggml_type_size(dst->type);
|
1416
|
+
|
1417
|
+
for (int c = 0; c < IC; c++) {
|
1418
|
+
cur_permute_buffer = (char*)tmp_permute_buffer + offset +
|
1419
|
+
KH * KW * c * ggml_type_size(dst->type);
|
1420
|
+
cur_dst_buffer = (char*)dst->data +
|
1421
|
+
c * KH * KW * n_step_w * ggml_type_size(dst->type);
|
1422
|
+
|
1423
|
+
for (int i = 0; i < n_step_w; i++) {
|
1424
|
+
ACL_CHECK(aclrtMemcpyAsync(
|
1425
|
+
cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
|
1426
|
+
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
1427
|
+
cur_dst_buffer =
|
1428
|
+
(char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
|
1429
|
+
cur_permute_buffer = (char*)cur_permute_buffer +
|
1430
|
+
KH * KW * IC * ggml_type_size(dst->type);
|
1431
|
+
}
|
1432
|
+
}
|
1433
|
+
} else {
|
1434
|
+
offset = KH * KW * n_step_w *
|
1435
|
+
ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
|
1436
|
+
ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
|
1437
|
+
(char*)tmp_permute_buffer + offset, offset,
|
1438
|
+
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
1439
|
+
}
|
1440
|
+
|
1441
|
+
// release
|
1442
|
+
ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
|
1443
|
+
}
|
1444
|
+
|
1445
|
+
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
1446
|
+
ggml_tensor* src0 = dst->src[0]; // kernel
|
1447
|
+
ggml_tensor* src1 = dst->src[1]; // input
|
1448
|
+
|
1449
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
1450
|
+
|
1451
|
+
// aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
|
1452
|
+
// im2col and do post-processing to restore it to 1D.
|
1453
|
+
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
|
1454
|
+
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
1455
|
+
const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
|
1456
|
+
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
|
1457
|
+
const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
|
1458
|
+
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
|
1459
|
+
const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
|
1460
|
+
|
1461
|
+
const int64_t N = ne13;
|
1462
|
+
const int64_t IC = ne12;
|
1463
|
+
const int64_t KH = ne01;
|
1464
|
+
const int64_t KW = ne00;
|
1465
|
+
const int64_t IW = ne10;
|
1466
|
+
|
1467
|
+
const int64_t OH = is_2D ? ne2 : 1;
|
1468
|
+
const int64_t OW = ne1;
|
1469
|
+
|
1470
|
+
// memory allocation is increased to 3x when is_2D == false
|
1471
|
+
const int64_t n_bytes_factor = is_2D ? 1 : 3;
|
1472
|
+
|
1473
|
+
// im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
|
1474
|
+
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
1475
|
+
int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
|
1476
|
+
size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
|
1477
|
+
|
1478
|
+
tmp_im2col_nb[0] = ggml_type_size(src1->type);
|
1479
|
+
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
|
1480
|
+
tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
|
1481
|
+
}
|
1482
|
+
|
1483
|
+
// Calculate im2col.
|
1484
|
+
// If dst is f16, tmp_buffer is f32; we need to allocate src.typesize *
|
1485
|
+
// dst.elemcount.
|
1486
|
+
ggml_cann_pool_alloc im2col_allocator(
|
1487
|
+
ctx.pool(),
|
1488
|
+
ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
|
1489
|
+
void* tmp_im2col_buffer = im2col_allocator.get();
|
1490
|
+
|
1491
|
+
aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
|
1492
|
+
tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
|
1493
|
+
ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
|
1494
|
+
GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
|
1495
|
+
|
1496
|
+
std::vector<int64_t> kernel_dims = {KH, KW};
|
1497
|
+
std::vector<int64_t> dilation_size = {d1, d0};
|
1498
|
+
std::vector<int64_t> padding_dims = {p1, p0};
|
1499
|
+
std::vector<int64_t> stride_dims = {s1, s0};
|
1500
|
+
auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
|
1501
|
+
auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
|
1502
|
+
auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
|
1503
|
+
auto* strides = aclCreateIntArray(stride_dims.data(), 2);
|
1504
|
+
|
1505
|
+
uint64_t workspaceSize = 0;
|
1506
|
+
aclOpExecutor* executor;
|
1507
|
+
void* workspaceAddr = nullptr;
|
1508
|
+
|
1509
|
+
ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations,
|
1510
|
+
paddings, strides, tmp_im2col_tensor,
|
1511
|
+
&workspaceSize, &executor));
|
1512
|
+
|
1513
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
|
1514
|
+
if (workspaceSize > 0) {
|
1515
|
+
workspace_allocator.alloc(workspaceSize);
|
1516
|
+
workspaceAddr = workspace_allocator.get();
|
1517
|
+
}
|
1518
|
+
|
1519
|
+
ACL_CHECK(
|
1520
|
+
aclnnIm2col(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
1521
|
+
|
1522
|
+
// Cast if dst is f16.
|
1523
|
+
aclTensor* tmp_cast_tensor = nullptr;
|
1524
|
+
ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
|
1525
|
+
void* tmp_cast_buffer = nullptr;
|
1526
|
+
if (src1->type != dst->type) {
|
1527
|
+
tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
|
1528
|
+
tmp_cast_buffer = tmp_cast_allocator.get();
|
1529
|
+
size_t temp_cast_nb[GGML_MAX_DIMS - 1];
|
1530
|
+
temp_cast_nb[0] = ggml_type_size(dst->type);
|
1531
|
+
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
|
1532
|
+
temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
|
1533
|
+
}
|
1534
|
+
|
1535
|
+
tmp_cast_tensor = ggml_cann_create_tensor(
|
1536
|
+
tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
|
1537
|
+
ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
|
1538
|
+
GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
|
1539
|
+
aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
|
1540
|
+
ggml_cann_type_mapping(dst->type));
|
1541
|
+
}
|
1542
|
+
|
1543
|
+
// post-processing
|
1544
|
+
if (is_2D) {
|
1545
|
+
ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
|
1546
|
+
tmp_im2col_tensor);
|
1547
|
+
} else {
|
1548
|
+
std::vector<int64_t> im2col_op_params = {
|
1549
|
+
KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
|
1550
|
+
ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
|
1551
|
+
tmp_im2col_tensor, im2col_op_params);
|
1552
|
+
}
|
1553
|
+
|
1554
|
+
// release
|
1555
|
+
ACL_CHECK(aclDestroyTensor(acl_src1));
|
1556
|
+
ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
|
1557
|
+
ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
|
1558
|
+
ACL_CHECK(aclDestroyIntArray(kernel_size));
|
1559
|
+
ACL_CHECK(aclDestroyIntArray(dilations));
|
1560
|
+
ACL_CHECK(aclDestroyIntArray(paddings));
|
1561
|
+
ACL_CHECK(aclDestroyIntArray(strides));
|
1562
|
+
}
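// Shape sketch: the number of kernel positions along W used above follows the
// usual convolution arithmetic,
//
//     OW = (IW + 2*p0 - d0*(KW - 1) - 1) / s0 + 1
//
// e.g. IW = 10, KW = 3, s0 = 1, p0 = 0, d0 = 1 gives OW = 8, so the 1-D im2col
// of a single channel is an [OW, KH*KW] = [8, 3] patch matrix (KH = 1).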
|
1563
|
+
|
1564
|
+
/**
|
1565
|
+
* @brief Applies element-wise exponential function to the elements of a tensor.
|
1566
|
+
*
|
1567
|
+
* This function computes the exponential of each element in the source tensor
|
1568
|
+
* `acl_src` and stores the result back into the same tensor.
|
1569
|
+
* The operation is defined as:
|
1570
|
+
* \f[
|
1571
|
+
* \text {acl_src }_i=e^{acl\_src_i}
|
1572
|
+
* \f]
|
1573
|
+
*
|
1574
|
+
* @param ctx The context for the CANN backend operations.
|
1575
|
+
* @param acl_src The tensor on which the exponential function will be applied.
|
1576
|
+
*/
|
1577
|
+
static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
|
1578
|
+
uint64_t workspaceSize = 0;
|
1579
|
+
aclOpExecutor* executor;
|
1580
|
+
void* workspaceAddr = nullptr;
|
1581
|
+
|
1582
|
+
ACL_CHECK(
|
1583
|
+
aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor));
|
1584
|
+
if (workspaceSize > 0) {
|
1585
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1586
|
+
workspaceAddr = workspace_allocator.get();
|
1587
|
+
}
|
1588
|
+
|
1589
|
+
ACL_CHECK(
|
1590
|
+
aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
1591
|
+
}
|
1592
|
+
|
1593
|
+
/**
|
1594
|
+
* @brief Multiplies elements of a tensor by a scalar value, optionally
|
1595
|
+
* in-place.
|
1596
|
+
*
|
1597
|
+
* This function multiplies each element of the source tensor `acl_src` by the
|
1598
|
+
* scalar `scale` and stores the result in the destination tensor `acl_dst`. If
|
1599
|
+
* `inplace` is true, `acl_dst` will not be used and the operation is performed
|
1600
|
+
* in-place on `acl_src`.
|
1601
|
+
* The operation is defined as:
|
1602
|
+
* \f[
|
1603
|
+
* \text {acl_dst }_i=\text {acl_src }_i \times \text {scale}
|
1604
|
+
* \f]
|
1605
|
+
*
|
1606
|
+
* @param ctx The context for the CANN backend operations.
|
1607
|
+
* @param acl_src The source tensor whose elements will be multiplied.
|
1608
|
+
* @param scale The scalar value by which each element of `acl_src` will be
|
1609
|
+
* multiplied.
|
1610
|
+
* @param acl_dst The destination tensor where the result will be stored if
|
1611
|
+
* `inplace` is false.
|
1612
|
+
* @param inplace Flag indicating whether to perform the operation in-place on
|
1613
|
+
* `acl_src`.
|
1614
|
+
*/
|
1615
|
+
static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
1616
|
+
float scale, aclTensor* acl_dst, bool inplace) {
|
1617
|
+
aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
|
1618
|
+
|
1619
|
+
uint64_t workspaceSize = 0;
|
1620
|
+
aclOpExecutor* executor;
|
1621
|
+
void* workspaceAddr = nullptr;
|
1622
|
+
|
1623
|
+
if (inplace) {
|
1624
|
+
ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale,
|
1625
|
+
&workspaceSize, &executor));
|
1626
|
+
if (workspaceSize > 0) {
|
1627
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1628
|
+
workspaceAddr = workspace_allocator.get();
|
1629
|
+
}
|
1630
|
+
|
1631
|
+
ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor,
|
1632
|
+
ctx.stream()));
|
1633
|
+
} else {
|
1634
|
+
ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst,
|
1635
|
+
&workspaceSize, &executor));
|
1636
|
+
if (workspaceSize > 0) {
|
1637
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1638
|
+
workspaceAddr = workspace_allocator.get();
|
1639
|
+
}
|
1640
|
+
|
1641
|
+
ACL_CHECK(
|
1642
|
+
aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
1643
|
+
}
|
1644
|
+
|
1645
|
+
ACL_CHECK(aclDestroyScalar(acl_scale));
|
1646
|
+
}
|
1647
|
+
|
1648
|
+
/**
|
1649
|
+
* @brief Performs an in-place element-wise multiplication of two tensors.
|
1650
|
+
*
|
1651
|
+
* This function performs an element-wise multiplication of the tensors
|
1652
|
+
* `acl_src` and `acl_other` and stores the result in `acl_src`.
|
1653
|
+
* The operation is defined as:
|
1654
|
+
* \f[
|
1655
|
+
* \text {acl_src }_i=\text {acl_src }_i \times \text {acl_other }_i
|
1656
|
+
* \f]
|
1657
|
+
*
|
1658
|
+
* @param ctx The context for the CANN backend operations.
|
1659
|
+
* @param acl_src The source tensor where the multiplication result will be
|
1660
|
+
* stored.
|
1661
|
+
* @param acl_other The tensor whose elements will be multiplied with `acl_src`.
|
1662
|
+
*/
|
1663
|
+
static void aclnn_inplace_mul(ggml_backend_cann_context& ctx,
|
1664
|
+
aclTensor* acl_src, aclTensor* acl_other) {
|
1665
|
+
uint64_t workspaceSize = 0;
|
1666
|
+
aclOpExecutor* executor;
|
1667
|
+
void* workspaceAddr = nullptr;
|
1668
|
+
|
1669
|
+
ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other,
|
1670
|
+
&workspaceSize, &executor));
|
1671
|
+
if (workspaceSize > 0) {
|
1672
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1673
|
+
workspaceAddr = workspace_allocator.get();
|
1674
|
+
}
|
1675
|
+
|
1676
|
+
ACL_CHECK(
|
1677
|
+
aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
1678
|
+
}
|
1679
|
+
|
1680
|
+
/**
|
1681
|
+
* @brief Performs element-wise multiplication of two tensors and stores the
|
1682
|
+
* result in a destination tensor.
|
1683
|
+
*
|
1684
|
+
* This function performs element-wise multiplication of the tensors `acl_src`
|
1685
|
+
* and `acl_other` and stores the result in the destination tensor `acl_dst`.
|
1686
|
+
* The operation is defined as:
|
1687
|
+
* \f[
|
1688
|
+
* \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i
|
1689
|
+
* \f]
|
1690
|
+
*
|
1691
|
+
* @param ctx The context for the CANN backend operations.
|
1692
|
+
* @param acl_src The first tensor for element-wise multiplication.
|
1693
|
+
* @param acl_other The second tensor for element-wise multiplication.
|
1694
|
+
* @param acl_dst The destination tensor where the result will be stored.
|
1695
|
+
*/
|
1696
|
+
static void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
1697
|
+
aclTensor* acl_other, aclTensor* acl_dst) {
|
1698
|
+
uint64_t workspaceSize = 0;
|
1699
|
+
aclOpExecutor* executor;
|
1700
|
+
void* workspaceAddr = nullptr;
|
1701
|
+
|
1702
|
+
ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst,
|
1703
|
+
&workspaceSize, &executor));
|
1704
|
+
if (workspaceSize > 0) {
|
1705
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1706
|
+
workspaceAddr = workspace_allocator.get();
|
1707
|
+
}
|
1708
|
+
|
1709
|
+
ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
1710
|
+
}
|
1711
|
+
|
1712
|
+
/**
|
1713
|
+
* @brief Applies element-wise cosine function to the elements of a tensor.
|
1714
|
+
*
|
1715
|
+
* This function computes the cosine of each element in the source tensor
|
1716
|
+
* `acl_src` and stores the result in the destination tensor `acl_dst`. The
|
1717
|
+
* operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src
|
1718
|
+
* }_i\right) \f]
|
1719
|
+
*
|
1720
|
+
* @param ctx The context for the CANN backend operations.
|
1721
|
+
* @param acl_src The source tensor on which the cosine function will be
|
1722
|
+
* applied.
|
1723
|
+
* @param acl_dst The destination tensor where the cosine results will be
|
1724
|
+
* stored.
|
1725
|
+
*/
|
1726
|
+
static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
1727
|
+
aclTensor* acl_dst) {
|
1728
|
+
uint64_t workspaceSize = 0;
|
1729
|
+
aclOpExecutor* executor;
|
1730
|
+
void* workspaceAddr = nullptr;
|
1731
|
+
|
1732
|
+
ACL_CHECK(
|
1733
|
+
aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
|
1734
|
+
if (workspaceSize > 0) {
|
1735
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1736
|
+
workspaceAddr = workspace_allocator.get();
|
1737
|
+
}
|
1738
|
+
|
1739
|
+
ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
1740
|
+
}
|
1741
|
+
|
1742
|
+
/**
|
1743
|
+
* @brief Applies element-wise sine function to the elements of a tensor.
|
1744
|
+
*
|
1745
|
+
* This function computes the sine of each element in the source tensor
|
1746
|
+
`acl_src`
|
1747
|
+
* and stores the result in the destination tensor `acl_dst`.
|
1748
|
+
* The operation is defined as:
|
1749
|
+
* \f[
|
1750
|
+
* \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right)
|
1751
|
+
* \f]
|
1752
|
+
|
1753
|
+
* @param ctx The context for the CANN backend operations.
|
1754
|
+
* @param acl_src The source tensor on which the sine function will be applied.
|
1755
|
+
* @param acl_dst The destination tensor where the sine results will be stored.
|
1756
|
+
*/
|
1757
|
+
static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
1758
|
+
aclTensor* acl_dst) {
|
1759
|
+
uint64_t workspaceSize = 0;
|
1760
|
+
aclOpExecutor* executor;
|
1761
|
+
void* workspaceAddr = nullptr;
|
1762
|
+
|
1763
|
+
ACL_CHECK(
|
1764
|
+
aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
|
1765
|
+
if (workspaceSize > 0) {
|
1766
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1767
|
+
workspaceAddr = workspace_allocator.get();
|
1768
|
+
}
|
1769
|
+
|
1770
|
+
ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
1771
|
+
}
|
1772
|
+
|
1773
|
+
/**
|
1774
|
+
* @brief Performs element-wise division of tensor1 by tensor2, multiplies the
|
1775
|
+
* result by the scalar value and adds it to self.
|
1776
|
+
*
|
1777
|
+
* Performs element-wise division of tensor1 by tensor2,
|
1778
|
+
* multiplies the result by the scalar value and adds it to self.
|
1779
|
+
* The operation is defined as:
|
1780
|
+
* \f[
|
1781
|
+
* \text{out}_i = \text{self}_i + \text{value} \times
|
1782
|
+
\frac{\text{tensor1}_i}{\text{tensor2}_i}
|
1783
|
+
* \f]
|
1784
|
+
|
1785
|
+
* @param ctx The context for the CANN backend operations.
|
1786
|
+
* @param acl_self The source tensor on which the addcdiv function will be
|
1787
|
+
applied.
|
1788
|
+
* @param tensor1 Numerator tensor.
|
1789
|
+
* @param tensor2 Denominator tensor.
|
1790
|
+
* @param value The value to be used as the coefficient.
|
1791
|
+
*/
|
1792
|
+
static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
|
1793
|
+
aclTensor* acl_self, aclTensor* tensor1,
|
1794
|
+
aclTensor* tensor2, float value) {
|
1795
|
+
uint64_t workspaceSize = 0;
|
1796
|
+
aclOpExecutor* executor;
|
1797
|
+
void* workspaceAddr = nullptr;
|
1798
|
+
aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
|
1799
|
+
|
1800
|
+
ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
|
1801
|
+
acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
|
1802
|
+
if (workspaceSize > 0) {
|
1803
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1804
|
+
workspaceAddr = workspace_allocator.get();
|
1805
|
+
}
|
1806
|
+
|
1807
|
+
ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
|
1808
|
+
ctx.stream()));
|
1809
|
+
}
|
1810
|
+
|
1811
|
+
/**
|
1812
|
+
* @brief Element-wise tensor division, optionally in-place.
|
1813
|
+
*
|
1814
|
+
* This function divides each element of the source tensor `acl_src` by the
|
1815
|
+
* tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
|
1816
|
+
* If `inplace` is true, `acl_dst` will not be used and the operation is
|
1817
|
+
* performed in-place on `acl_src`. The operation is defined as: \f[
|
1818
|
+
* \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
|
1819
|
+
* \f]
|
1820
|
+
*
|
1821
|
+
* @param ctx The context for the CANN backend operations.
|
1822
|
+
* @param acl_src Numerator tensor.
|
1823
|
+
* @param acl_other Denominator tensor.
|
1824
|
+
* @param acl_dst The destination tensor where the result will be stored if
|
1825
|
+
* `inplace` is false.
|
1826
|
+
* @param inplace Flag indicating whether to perform the operation in-place on
|
1827
|
+
* `acl_src`.
|
1828
|
+
*/
|
1829
|
+
static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
1830
|
+
aclTensor* acl_other, aclTensor* acl_dst,
|
1831
|
+
bool inplace) {
|
1832
|
+
uint64_t workspaceSize = 0;
|
1833
|
+
aclOpExecutor* executor;
|
1834
|
+
void* workspaceAddr = nullptr;
|
1835
|
+
|
1836
|
+
if (inplace) {
|
1837
|
+
ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
|
1838
|
+
&workspaceSize, &executor));
|
1839
|
+
if (workspaceSize > 0) {
|
1840
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1841
|
+
workspaceAddr = workspace_allocator.get();
|
1842
|
+
}
|
1843
|
+
|
1844
|
+
ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
|
1845
|
+
ctx.stream()));
|
1846
|
+
} else {
|
1847
|
+
ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
|
1848
|
+
&workspaceSize, &executor));
|
1849
|
+
if (workspaceSize > 0) {
|
1850
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1851
|
+
workspaceAddr = workspace_allocator.get();
|
1852
|
+
}
|
1853
|
+
|
1854
|
+
ACL_CHECK(
|
1855
|
+
aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
1856
|
+
}
|
1857
|
+
}
|
1858
|
+
|
1859
|
+
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
|
1860
|
+
ggml_tensor* dst) {
|
1861
|
+
const ggml_tensor* src = dst->src[0];
|
1862
|
+
|
1863
|
+
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
1864
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
1865
|
+
|
1866
|
+
const int dim = dst->op_params[0];
|
1867
|
+
const int max_period = dst->op_params[1];
|
1868
|
+
int half = dim / 2;
|
1869
|
+
|
1870
|
+
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
1871
|
+
|
1872
|
+
// arange: [0, ..., half)
|
1873
|
+
float start = 0;
|
1874
|
+
float stop = half;
|
1875
|
+
float step = 1;
|
1876
|
+
int64_t n_elements_arange = half;
|
1877
|
+
int64_t tmp_arange_ne[] = {half};
|
1878
|
+
size_t tmp_arange_nb[] = {sizeof(dst->type)};
|
1879
|
+
|
1880
|
+
ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type));
|
1881
|
+
void* tmp_arange_buffer = arange_allocator.get();
|
1882
|
+
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
|
1883
|
+
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
|
1884
|
+
ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb,
|
1885
|
+
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
1886
|
+
|
1887
|
+
aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange);
|
1888
|
+
|
1889
|
+
// freq
|
1890
|
+
float freq_param = -logf(max_period) / half;
|
1891
|
+
bool inplace = true;
|
1892
|
+
aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace);
|
1893
|
+
aclnn_exp(ctx, tmp_arange_tensor);
|
1894
|
+
|
1895
|
+
// permute: src [0,1,2,3]->[0,1,3,2]
|
1896
|
+
int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
|
1897
|
+
size_t tmp_permute_nb[GGML_MAX_DIMS];
|
1898
|
+
tmp_permute_nb[0] = ggml_type_size(src->type);
|
1899
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
1900
|
+
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
|
1901
|
+
}
|
1902
|
+
|
1903
|
+
ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
|
1904
|
+
void* tmp_permute_buffer = permute_allocator.get();
|
1905
|
+
aclTensor* tmp_permute_tenosr = ggml_cann_create_tensor(
|
1906
|
+
tmp_permute_buffer, ggml_cann_type_mapping(src->type),
|
1907
|
+
ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
|
1908
|
+
GGML_MAX_DIMS, ACL_FORMAT_ND);
|
1909
|
+
int64_t permute_dim[] = {0, 1, 3, 2};
|
1910
|
+
int64_t num_dims = 4;
|
1911
|
+
aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims);
|
1912
|
+
|
1913
|
+
// timestep * freq
|
1914
|
+
int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
|
1915
|
+
src->ne[3]};
|
1916
|
+
size_t tmp_mul_nb[GGML_MAX_DIMS];
|
1917
|
+
tmp_mul_nb[0] = ggml_type_size(src->type);
|
1918
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
1919
|
+
tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
|
1920
|
+
}
|
1921
|
+
|
1922
|
+
int mul_nelements =
|
1923
|
+
src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
|
1924
|
+
|
1925
|
+
ggml_cann_pool_alloc mul_allocator(
|
1926
|
+
ctx.pool(), mul_nelements * ggml_type_size(src->type));
|
1927
|
+
void* tmp_mul_buffer = mul_allocator.get();
|
1928
|
+
aclTensor* tmp_mul_tensor = ggml_cann_create_tensor(
|
1929
|
+
tmp_mul_buffer, ggml_cann_type_mapping(src->type),
|
1930
|
+
ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
|
1931
|
+
ACL_FORMAT_ND);
|
1932
|
+
aclnn_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, tmp_mul_tensor);
|
1933
|
+
|
1934
|
+
// cos
|
1935
|
+
ggml_cann_pool_alloc cos_allocator(
|
1936
|
+
ctx.pool(), mul_nelements * ggml_type_size(src->type));
|
1937
|
+
void* tmp_cos_buffer = cos_allocator.get();
|
1938
|
+
aclTensor* tmp_cos_tensor = ggml_cann_create_tensor(
|
1939
|
+
tmp_cos_buffer, ggml_cann_type_mapping(dst->type),
|
1940
|
+
ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
|
1941
|
+
ACL_FORMAT_ND);
|
1942
|
+
|
1943
|
+
aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor);
|
1944
|
+
|
1945
|
+
// sin
|
1946
|
+
ggml_cann_pool_alloc sin_allocator(
|
1947
|
+
ctx.pool(), mul_nelements * ggml_type_size(src->type));
|
1948
|
+
void* tmp_sin_buffer = sin_allocator.get();
|
1949
|
+
aclTensor* tmp_sin_tensor = ggml_cann_create_tensor(
|
1950
|
+
tmp_sin_buffer, ggml_cann_type_mapping(dst->type),
|
1951
|
+
ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
|
1952
|
+
ACL_FORMAT_ND);
|
1953
|
+
|
1954
|
+
aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor);
|
1955
|
+
|
1956
|
+
// concat
|
1957
|
+
int64_t concat_dim = 3;
|
1958
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
1959
|
+
aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
|
1960
|
+
aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
|
1961
|
+
aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
|
1962
|
+
|
1963
|
+
// release
|
1964
|
+
// destroying both the tensorList and its elements causes a segmentation fault.
|
1965
|
+
ACL_CHECK(aclDestroyTensorList(tensorList));
|
1966
|
+
ACL_CHECK(aclDestroyTensor(acl_src));
|
1967
|
+
ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
|
1968
|
+
ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr));
|
1969
|
+
ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
|
1970
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
1971
|
+
}
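// In effect, the routine above builds the sinusoidal timestep embedding
//
//     freq_k = exp(-ln(max_period) * k / half),   k = 0 .. half-1
//     dst    = concat(cos(t * freq_k), sin(t * freq_k))
//
// with `t` taken from src and the two halves concatenated along the embedding
// dimension.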
|
1972
|
+
|
1973
|
+
/**
|
1974
|
+
* @brief Fills a tensor with a scalar value.
|
1975
|
+
*
|
1976
|
+
* This function fills the destination tensor `acl_dst` with the scalar value
|
1977
|
+
* `scalar`.
|
1978
|
+
*
|
1979
|
+
* @param ctx The context for the CANN backend operations.
|
1980
|
+
* @param scalar The scalar value used to fill the tensor.
|
1981
|
+
* @param acl_dst The destination tensor to be filled with the scalar value.
|
1982
|
+
*/
|
1983
|
+
static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
|
1984
|
+
aclTensor* acl_dst) {
|
1985
|
+
auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
|
1986
|
+
|
1987
|
+
uint64_t workspaceSize = 0;
|
1988
|
+
aclOpExecutor* executor;
|
1989
|
+
void* workspaceAddr = nullptr;
|
1990
|
+
|
1991
|
+
ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize(
|
1992
|
+
acl_dst, acl_scalar, &workspaceSize, &executor));
|
1993
|
+
if (workspaceSize > 0) {
|
1994
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
1995
|
+
workspaceAddr = workspace_allocator.get();
|
1996
|
+
}
|
1997
|
+
|
1998
|
+
ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor,
|
1999
|
+
ctx.stream()));
|
2000
|
+
ACL_CHECK(aclDestroyScalar(acl_scalar));
|
2001
|
+
}
|
2002
|
+
|
2003
|
+
/**
|
2004
|
+
* @brief Raises each element of a tensor to the power of the corresponding
|
2005
|
+
* element in another tensor.
|
2006
|
+
*
|
2007
|
+
* This function computes the element-wise power of the destination tensor
|
2008
|
+
* `acl_dst` raised to the power of the exponent tensor `acl_exp`.
|
2009
|
+
* The operation is defined as:
|
2010
|
+
* \f[
|
2011
|
+
* \text {acl_dst }_i=acl\_dst_i^{\text {acl_exp }_i}
|
2012
|
+
* \f]
|
2013
|
+
*
|
2014
|
+
* @param ctx The context for the CANN backend operations.
|
2015
|
+
* @param acl_dst The destination tensor, which also serves as the base tensor.
|
2016
|
+
* @param acl_exp The exponent tensor, each element of which is used to raise
|
2017
|
+
* the corresponding element in the destination tensor.
|
2018
|
+
*/
|
2019
|
+
static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
|
2020
|
+
aclTensor* acl_dst, aclTensor* acl_exp) {
|
2021
|
+
uint64_t workspaceSize = 0;
|
2022
|
+
aclOpExecutor* executor;
|
2023
|
+
void* workspaceAddr = nullptr;
|
2024
|
+
|
2025
|
+
ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize(
|
2026
|
+
acl_dst, acl_exp, &workspaceSize, &executor));
|
2027
|
+
if (workspaceSize > 0) {
|
2028
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
2029
|
+
workspaceAddr = workspace_allocator.get();
|
2030
|
+
}
|
2031
|
+
|
2032
|
+
ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize,
|
2033
|
+
executor, ctx.stream()));
|
2034
|
+
}
|
2035
|
+
|
2036
|
+
/**
|
2037
|
+
* @brief Applies the Alibi (Attention with Linear Biases) mechanism to the attention scores.
|
2038
|
+
* @details This function implements the Alibi mechanism, which introduces
|
2039
|
+
* learnable biases into the attention scores to simulate relative
|
2040
|
+
* position encoding without the need for explicit positional
|
2041
|
+
* embeddings.
|
2042
|
+
*
|
2043
|
+
* @param ctx The backend CANN context for executing operations.
|
2044
|
+
* @param acl_src The source tensor representing the query or key.
|
2045
|
+
* @param acl_position The position tensor containing relative positions.
|
2046
|
+
* @param acl_dst The destination tensor where the result will be stored.
|
2047
|
+
* @param n_head The number of attention heads.
|
2048
|
+
* @param src_ne The dimensions of the source tensor.
|
2049
|
+
* @param src_nb0 The byte size of the first dimension of the source
|
2050
|
+
tensor.
|
2051
|
+
* @param max_bias The maximum bias value used in the Alibi mechanism.
|
2052
|
+
* @param dst The destination tensor object for additional metadata.
|
2053
|
+
*
|
2054
|
+
* The function performs the following steps:
|
2055
|
+
* 1. Calculates the logarithm floor of the number of heads to determine the
|
2056
|
+
base for bias calculation.
|
2057
|
+
* 2. Initializes arrays with arithmetic sequences and fills them with bias
|
2058
|
+
values.
|
2059
|
+
* 3. Computes the bias tensor based on the calculated biases and arithmetic
|
2060
|
+
sequences.
|
2061
|
+
* 4. Reshapes the bias tensor to match the dimensions of the input tensors.
|
2062
|
+
* 5. Multiplies the position tensor by the bias tensor.
|
2063
|
+
* 6. Adds the result of the multiplication to the source tensor to produce the
|
2064
|
+
final output.
|
2065
|
+
*/
|
2066
|
+
static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
2067
|
+
aclTensor* acl_position, aclTensor* acl_dst,
|
2068
|
+
const int n_head, int64_t* src_ne, const size_t src_nb0,
|
2069
|
+
float max_bias, ggml_tensor* dst) {
|
2070
|
+
const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
|
2071
|
+
GGML_ASSERT(src_nb0 == sizeof(float));
|
2072
|
+
GGML_ASSERT(n_head == src_ne[2]);
|
2073
|
+
|
2074
|
+
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
|
2075
|
+
|
2076
|
+
float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
2077
|
+
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
2078
|
+
|
2079
|
+
// init arange
|
2080
|
+
ggml_cann_pool_alloc arange_allocator(ctx.pool(),
|
2081
|
+
ne2_ne3 * ggml_type_size(dst->type));
|
2082
|
+
void* tmp_arange_buffer = arange_allocator.get();
|
2083
|
+
|
2084
|
+
// arange1: [1, ..., n_heads_log2_floor+1)
|
2085
|
+
float start = 1;
|
2086
|
+
float stop = n_heads_log2_floor + 1;
|
2087
|
+
float step = 1;
|
2088
|
+
int64_t n_elements_arange = n_heads_log2_floor;
|
2089
|
+
|
2090
|
+
int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
|
2091
|
+
size_t tmp_arange1_nb[] = {sizeof(dst->type)};
|
2092
|
+
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
|
2093
|
+
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
|
2094
|
+
ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
|
2095
|
+
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
2096
|
+
|
2097
|
+
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
|
2098
|
+
|
2099
|
+
aclTensor* tmp_arange2_tensor = nullptr;
|
2100
|
+
if (n_heads_log2_floor < ne2_ne3) {
|
2101
|
+
// arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
|
2102
|
+
start = 1;
|
2103
|
+
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
|
2104
|
+
step = 2;
|
2105
|
+
n_elements_arange = ne2_ne3 - n_heads_log2_floor;
|
2106
|
+
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
2107
|
+
size_t tmp_arange2_nb[] = {sizeof(dst->type)};
|
2108
|
+
|
2109
|
+
aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
|
2110
|
+
(char*)tmp_arange_buffer +
|
2111
|
+
n_heads_log2_floor * ggml_type_size(dst->type),
|
2112
|
+
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
2113
|
+
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
2114
|
+
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
|
2115
|
+
n_elements_arange);
|
2116
|
+
}
|
2117
|
+
|
2118
|
+
// init mk_base
|
2119
|
+
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
|
2120
|
+
ne2_ne3 * ggml_type_size(dst->type));
|
2121
|
+
void* tmp_mk_base_buffer = mk_base_allocator.get();
|
2122
|
+
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
|
2123
|
+
size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
|
2124
|
+
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
|
2125
|
+
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
|
2126
|
+
ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
|
2127
|
+
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
2128
|
+
|
2129
|
+
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
|
2130
|
+
|
2131
|
+
aclTensor* tmp_mk_base2_tensor = nullptr;
|
2132
|
+
if (n_heads_log2_floor < ne2_ne3) {
|
2133
|
+
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
2134
|
+
size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
|
2135
|
+
aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
|
2136
|
+
(char*)tmp_mk_base_buffer +
|
2137
|
+
n_heads_log2_floor * ggml_type_size(dst->type),
|
2138
|
+
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
2139
|
+
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
2140
|
+
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
|
2141
|
+
}
|
2142
|
+
|
2143
|
+
// init mk
|
2144
|
+
int64_t tmp_mk_base_ne[] = {ne2_ne3};
|
2145
|
+
size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
|
2146
|
+
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
|
2147
|
+
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
|
2148
|
+
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
|
2149
|
+
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
2150
|
+
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
|
2151
|
+
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
|
2152
|
+
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
|
2153
|
+
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
2154
|
+
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
|
2155
|
+
|
2156
|
+
// reshape mk
|
2157
|
+
int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
|
2158
|
+
size_t tmp_mk_nb[GGML_MAX_DIMS];
|
2159
|
+
tmp_mk_nb[0] = ggml_type_size(dst->type);
|
2160
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
2161
|
+
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
|
2162
|
+
}
|
2163
|
+
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
|
2164
|
+
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
|
2165
|
+
ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
|
2166
|
+
ACL_FORMAT_ND);
|
2167
|
+
|
2168
|
+
// acl_position * mk
|
2169
|
+
int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
|
2170
|
+
size_t tmp_output_nb[GGML_MAX_DIMS];
|
2171
|
+
tmp_output_nb[0] = ggml_type_size(dst->type);
|
2172
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
2173
|
+
tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
|
2174
|
+
}
|
2175
|
+
ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
|
2176
|
+
void* tmp_output_buffer = output_allocator.get();
|
2177
|
+
aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
|
2178
|
+
tmp_output_buffer, ggml_cann_type_mapping(dst->type),
|
2179
|
+
ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
|
2180
|
+
ACL_FORMAT_ND);
|
2181
|
+
aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);
|
2182
|
+
|
2183
|
+
// add
|
2184
|
+
aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
|
2185
|
+
|
2186
|
+
ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor));
|
2187
|
+
ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor));
|
2188
|
+
ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor));
|
2189
|
+
ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor));
|
2190
|
+
ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor));
|
2191
|
+
ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
|
2192
|
+
ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
|
2193
|
+
ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
|
2194
|
+
}
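// Slope sketch: with n = n_heads_log2_floor, head h (0-based) gets
//
//     slope_h = m0^(h+1)      = 2^(-max_bias * (h+1) / n)             for h <  n
//     slope_h = m1^(2(h-n)+1) = 2^(-(max_bias/2) * (2(h-n)+1) / n)    for h >= n
//
// and the bias added to the scores is slope_h * position, following the usual
// ALiBi slope scheme.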
|
2195
|
+
|
2196
|
+
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
2197
|
+
ggml_cann_dup(ctx, dst);
|
2198
|
+
}
|
2199
|
+
|
2200
|
+
/**
|
2201
|
+
* @brief Performs element-wise addition of two tensors in place.
|
2202
|
+
*
|
2203
|
+
* This function adds the source tensor `acl_src` to the destination tensor
|
2204
|
+
* `acl_dst` element-wise and stores the result in the destination tensor
|
2205
|
+
* `acl_dst`.
|
2206
|
+
*
|
2207
|
+
* @param ctx The context for the CANN backend operations.
|
2208
|
+
* @param acl_src The source tensor to be added.
|
2209
|
+
* @param acl_dst The destination tensor which will hold the result of the
|
2210
|
+
* addition.
|
2211
|
+
*/
|
2212
|
+
static void aclnn_inplace_add(ggml_backend_cann_context& ctx,
|
2213
|
+
aclTensor* acl_src, aclTensor* acl_dst) {
|
2214
|
+
aclScalar* alpha = nullptr;
|
2215
|
+
float alphaValue = 1.0f;
|
2216
|
+
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
|
2217
|
+
|
2218
|
+
uint64_t workspaceSize = 0;
|
2219
|
+
aclOpExecutor* executor;
|
2220
|
+
void* workspaceAddr = nullptr;
|
2221
|
+
|
2222
|
+
ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha,
|
2223
|
+
&workspaceSize, &executor));
|
2224
|
+
if (workspaceSize > 0) {
|
2225
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
2226
|
+
workspaceAddr = workspace_allocator.get();
|
2227
|
+
}
|
2228
|
+
|
2229
|
+
ACL_CHECK(
|
2230
|
+
aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
2231
|
+
|
2232
|
+
ACL_CHECK(aclDestroyScalar(alpha));
|
2233
|
+
}
|
2234
|
+
|
2235
|
+
/**
|
2236
|
+
* @brief Applies the softmax function to a tensor along a specified dimension.
|
2237
|
+
*
|
2238
|
+
* This function computes the softmax of the source tensor `acl_src` along the
|
2239
|
+
* specified dimension `dim` and stores the result in the destination tensor
|
2240
|
+
* `acl_dst`.
|
2241
|
+
*
|
2242
|
+
* @param ctx The context for the CANN backend operations.
|
2243
|
+
* @param acl_src The source tensor on which the softmax function will be
|
2244
|
+
* applied.
|
2245
|
+
* @param dim The dimension along which the softmax function will be computed.
|
2246
|
+
* @param acl_dst The destination tensor where the softmax results will be
|
2247
|
+
* stored.
|
2248
|
+
*/
|
2249
|
+
static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
2250
|
+
int64_t dim, aclTensor* acl_dst) {
|
2251
|
+
uint64_t workspaceSize = 0;
|
2252
|
+
aclOpExecutor* executor;
|
2253
|
+
void* workspaceAddr = nullptr;
|
2254
|
+
|
2255
|
+
ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(acl_src, dim, acl_dst,
|
2256
|
+
&workspaceSize, &executor));
|
2257
|
+
|
2258
|
+
if (workspaceSize > 0) {
|
2259
|
+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
2260
|
+
workspaceAddr = workspace_allocator.get();
|
2261
|
+
}
|
2262
|
+
|
2263
|
+
aclrtStream stream = ctx.stream();
|
2264
|
+
ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream));
|
2265
|
+
}
|
2266
|
+
|
2267
|
+
void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
2268
|
+
ggml_tensor* src0 = dst->src[0];
|
2269
|
+
ggml_tensor* src1 = dst->src[1]; // mask
|
2270
|
+
|
2271
|
+
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
2272
|
+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
2273
|
+
|
2274
|
+
float scale = 1.0f;
|
2275
|
+
float max_bias = 0.0f;
|
2276
|
+
|
2277
|
+
memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
|
2278
|
+
memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
|
2279
|
+
|
2280
|
+
// input mul scale
|
2281
|
+
aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
|
2282
|
+
|
2283
|
+
size_t n_bytes = ggml_nbytes(src0);
|
2284
|
+
ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
|
2285
|
+
void* input_mul_scale_buffer = mul_scale_allocator.get();
|
2286
|
+
aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
|
2287
|
+
input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
|
2288
|
+
src0->nb, GGML_MAX_DIMS);
|
2289
|
+
|
2290
|
+
bool inplace = false;
|
2291
|
+
aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);
|
2292
|
+
|
2293
|
+
// mask
|
2294
|
+
aclTensor* acl_src1_fp32_tensor = nullptr;
|
2295
|
+
aclTensor* tmp_mask_tensor = nullptr;
|
2296
|
+
ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
|
2297
|
+
if (src1) {
|
2298
|
+
const bool use_f16 = src1->type == GGML_TYPE_F16;
|
2299
|
+
if (use_f16) {
|
2300
|
+
// cast to fp32
|
2301
|
+
size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
|
2302
|
+
size_t src1_fp32_nb[GGML_MAX_DIMS];
|
2303
|
+
src1_fp32_nb[0] = sizeof(float_t);
|
2304
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
2305
|
+
src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
|
2306
|
+
}
|
2307
|
+
src1_fp32_allocator.alloc(n_bytes);
|
2308
|
+
void* src1_fp32_buffer = src1_fp32_allocator.get();
|
2309
|
+
acl_src1_fp32_tensor = ggml_cann_create_tensor(
|
2310
|
+
src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
|
2311
|
+
src1_fp32_nb, GGML_MAX_DIMS);
|
2312
|
+
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
2313
|
+
aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
|
2314
|
+
|
2315
|
+
ACL_CHECK(aclDestroyTensor(acl_src1));
|
2316
|
+
} else {
|
2317
|
+
acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
|
2318
|
+
}
|
2319
|
+
|
2320
|
+
// broadcast the mask across rows; only ne11 rows (not ne01) are taken from the mask
|
2321
|
+
if (src1->ne[1] != src0->ne[1]) {
|
2322
|
+
// mask shape: [1,1,ne11,ne10]
|
2323
|
+
int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
|
2324
|
+
size_t tmp_mask_nb[GGML_MAX_DIMS];
|
2325
|
+
tmp_mask_nb[0] = sizeof(float_t);
|
2326
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
2327
|
+
tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
|
2328
|
+
}
|
2329
|
+
tmp_mask_tensor = ggml_cann_create_tensor(
|
2330
|
+
src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
|
2331
|
+
GGML_MAX_DIMS, ACL_FORMAT_ND);
|
2332
|
+
}
|
2333
|
+
|
2334
|
+
// alibi
|
2335
|
+
const int n_head = src0->ne[2];
|
2336
|
+
const size_t src_nb0 = src0->nb[0];
|
2337
|
+
|
2338
|
+
n_bytes = ggml_nbytes(dst);
|
2339
|
+
ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
|
2340
|
+
void* output_buffer = output_allocator.get();
|
2341
|
+
aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
|
2342
|
+
output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
|
2343
|
+
dst->nb, GGML_MAX_DIMS);
|
2344
|
+
if (max_bias <= 0.0f) {
|
2345
|
+
// slope = 1.0
|
2346
|
+
if (tmp_mask_tensor) {
|
2347
|
+
aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
|
2348
|
+
alibi_output_tensor);
|
2349
|
+
} else {
|
2350
|
+
aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
|
2351
|
+
alibi_output_tensor);
|
2352
|
+
}
|
2353
|
+
} else {
|
2354
|
+
// slope != 1.0
|
2355
|
+
if (tmp_mask_tensor) {
|
2356
|
+
aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
|
2357
|
+
alibi_output_tensor, n_head, src0->ne, src_nb0,
|
2358
|
+
max_bias, dst);
|
2359
|
+
} else {
|
2360
|
+
aclnn_alibi(ctx, acl_input_mul_scale_tensor,
|
2361
|
+
acl_src1_fp32_tensor, alibi_output_tensor, n_head,
|
2362
|
+
src0->ne, src_nb0, max_bias, dst);
|
2363
|
+
}
|
2364
|
+
}
|
2365
|
+
|
2366
|
+
// softmax
|
2367
|
+
aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
|
2368
|
+
ACL_CHECK(aclDestroyTensor(alibi_output_tensor));
|
2369
|
+
} else {
|
2370
|
+
aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
|
2371
|
+
}
|
2372
|
+
|
2373
|
+
ACL_CHECK(aclDestroyTensor(acl_src0));
|
2374
|
+
ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor));
|
2375
|
+
ACL_CHECK(aclDestroyTensor(acl_dst));
|
2376
|
+
ACL_CHECK(aclDestroyScalar(acl_scale));
|
2377
|
+
ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor));
|
2378
|
+
ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
|
2379
|
+
}
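// Putting the pieces together, the routine above computes
//
//     dst = softmax(src0 * scale + slope * mask)    (slope = 1 when max_bias <= 0)
//
// with the softmax taken over the innermost (row) dimension, i.e. dim 3 of the
// 4-D ACL view.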
|
2380
|
+
|
2381
|
+
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];

    ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
    ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
    src0->extra = src0_extra_allocator.get();
    src1->extra = src1_extra_allocator.get();
    dst->extra = dst_extra_allocator.get();
    ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
                               ctx.stream()));
    ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
                               ctx.stream()));
    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
                               ctx.stream()));

    switch (src0->type) {
        case GGML_TYPE_F32: {
#ifdef ASCEND_310P
            // Special operation for get_row_f32 kernel of 310P: clear the
            // content of dest data buffer when row is not aligned to 32 bytes
            if ((src0->ne[0] % 8) != 0) {
                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
            }
#endif
            aclrtlaunch_ascendc_get_row_f32(
                24, ctx.stream(), src0->data, src1->data, dst->data,
                ((ggml_tensor*)src0->extra)->ne,
                ((ggml_tensor*)src0->extra)->nb,
                ((ggml_tensor*)src1->extra)->ne,
                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                ((ggml_tensor*)dst->extra)->nb);
            break;
        }
        case GGML_TYPE_F16: {
#ifdef ASCEND_310P
            // Special operation for get_row_f16 kernel of 310P: clear the
            // content of dest data buffer when row is not aligned to 32 bytes
            if ((src0->ne[0] % 16) != 0) {
                size_t dst_len =
                    src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
                    ggml_type_size(
                        GGML_TYPE_F32);  // out is also f32, even input is f16
                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
            }
#endif
            aclrtlaunch_ascendc_get_row_f16(
                24, ctx.stream(), src0->data, src1->data, dst->data,
                ((ggml_tensor*)src0->extra)->ne,
                ((ggml_tensor*)src0->extra)->nb,
                ((ggml_tensor*)src1->extra)->ne,
                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                ((ggml_tensor*)dst->extra)->nb);
            break;
        }
        case GGML_TYPE_Q4_0:
            aclrtlaunch_ascendc_get_row_q4_0(
                24, ctx.stream(), src0->data, src1->data, dst->data,
                ((ggml_tensor*)src0->extra)->ne,
                ((ggml_tensor*)src1->extra)->ne,
                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                ((ggml_tensor*)dst->extra)->nb);
            break;
        case GGML_TYPE_Q8_0:
            aclrtlaunch_ascendc_get_row_q8_0(
                24, ctx.stream(), src0->data, src1->data, dst->data,
                ((ggml_tensor*)src0->extra)->ne,
                ((ggml_tensor*)src1->extra)->ne,
                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                ((ggml_tensor*)dst->extra)->nb);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}

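// Reading note: aclnn_repeat_interleave below repeats each element along one
// dimension, e.g. repeats = 2 on the last dimension turns [x0, x1, x2] into
// [x0, x0, x1, x1, x2, x2]. The non-NeoX RoPE cache further down relies on
// this to duplicate each sin/cos value for its pair of rotated channels.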
/**
 * @brief Repeats elements of a tensor along a specified dimension.
 *
 * This function repeats each element of the source tensor `acl_src` a specified
 * number of times (`repeats`) along the specified dimension `dim` and stores
 * the result in the destination tensor `acl_dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor whose elements will be repeated.
 * @param acl_dst The destination tensor where the repeated elements will be
 * stored.
 * @param dim The dimension along which the elements will be repeated.
 * @param repeats The number of times each element will be repeated.
 * @param output_size The size of the output tensor.
 */
static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
                                    aclTensor* acl_src, aclTensor* acl_dst,
                                    int64_t dim, int64_t repeats,
                                    int64_t output_size) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize(
        acl_src, repeats, dim, output_size, acl_dst, &workspaceSize,
        &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize,
                                              executor, ctx.stream()));
}

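// The three helpers that follow all compute acl_dst = acl_input @ acl_weight
// and differ only in the aclnn entry point: aclnnMatmul (generic, with
// broadcasting), aclnnMm (strictly 2D) and aclnnBatchMatMul (batched 3D). As a
// shape sketch, an input of [m, k] against a weight of [k, n] yields a dst of
// [m, n]. cube_math_type selects the cube-unit precision mode; 1 is
// ALLOW_FP32_DOWN_PRECISION (per the inline comment), and the value 2 used by
// the 2D/3D variants is presumably another mode of the same enum.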
/**
 * @brief Performs matrix multiplication of two tensors.
 *
 * This function computes the matrix multiplication of the input tensor
 * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
 * destination tensor `acl_dst`.
 * The operation is defined as:
 * \f[
 * \text {acl_dst}=\text {acl_input@acl_weight}
 * \f]
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_input The input tensor for the matrix multiplication.
 * @param acl_weight The weight tensor for the matrix multiplication.
 * @param acl_dst The destination tensor where the result of the matrix
 * multiplication will be stored.
 */
static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
                          aclTensor* acl_weight, aclTensor* acl_dst) {
    int8_t cube_math_type = 1;  // ALLOW_FP32_DOWN_PRECISION, when input is
                                // fp32, atlas a2 will transpose it to HFLOAT32.
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
                                          cube_math_type, &workspaceSize,
                                          &executor));

    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(
        aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
}

/**
 * @brief Performs matrix multiplication of two 2D tensors.
 *
 * This function computes the matrix multiplication of the input tensor
 * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
 * destination tensor `acl_dst`.
 * The operation is defined as:
 * \f[
 * \text {acl_dst}=\text {acl_input@acl_weight}
 * \f]
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_input The input tensor for the matrix multiplication.
 * @param acl_weight The weight tensor for the matrix multiplication.
 * @param acl_dst The destination tensor where the result of the matrix
 * multiplication will be stored.
 */
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
                             aclTensor* acl_input, aclTensor* acl_weight,
                             aclTensor* acl_dst) {
    int8_t cube_math_type = 2;
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
                                      cube_math_type, &workspaceSize,
                                      &executor));

    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
}

/**
 * @brief Performs matrix multiplication of two 3D tensors.
 *
 * This function computes the matrix multiplication of the input tensor
 * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
 * destination tensor `acl_dst`.
 * The operation is defined as:
 * \f[
 * \text {acl_dst}=\text {acl_input@acl_weight}
 * \f]
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_input The input tensor for the matrix multiplication.
 * @param acl_weight The weight tensor for the matrix multiplication.
 * @param acl_dst The destination tensor where the result of the matrix
 * multiplication will be stored.
 */
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
                             aclTensor* acl_input, aclTensor* acl_weight,
                             aclTensor* acl_dst) {
    int8_t cube_math_type = 2;
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
                                               cube_math_type, &workspaceSize,
                                               &executor));

    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(
        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
}

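// How the dispatch below works: BCAST_MUL_MAT_SHAPE first derives
// broadcast-compatible shapes, then n_dims is collapsed. If both broadcast
// ne[3] values are 1 and both ne[2] values are 1, the plain 2D matmul is used;
// if, in addition to ne[3] being 1, only the input's ne[2] is 1, the batched
// 3D matmul is used; everything else falls back to the generic aclnnMatmul.
// The weight is passed with its first two ne/nb entries swapped, i.e. as a
// transposed view, so the transpose costs no data movement.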
/**
 * @brief Performs matrix multiplication with floating-point precision on
 * tensors using the CANN backend.
 *
 * This function performs matrix multiplication of the input tensor and the
 * weight tensor, handling broadcasting and transposing as needed, and stores
 * the result in the destination tensor `dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param dst The destination tensor where the result of the matrix
 * multiplication will be stored.
 */
static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
                                 ggml_tensor* dst) {
    ggml_tensor* weight = dst->src[0];  // weight
    ggml_tensor* input = dst->src[1];   // input

    // When weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize broadcasts
    // automatically; when weight ne2 or ne3 is not 1, the weight needs to be
    // repeated.
    BCAST_MUL_MAT_SHAPE(input, weight, dst);

    int64_t n_dims = bcast_dims;
    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
            n_dims = 2;
        } else if (bcast_input_ne[2] == 1) {
            n_dims = 3;
        }
    }

    aclTensor* acl_input_tensor =
        ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
                              bcast_weight_ne[2], bcast_weight_ne[3],
                              bcast_weight_ne[4], bcast_weight_ne[5]};
    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
                             bcast_weight_nb[2], bcast_weight_nb[3],
                             bcast_weight_nb[4], bcast_weight_nb[5]};
    aclTensor* acl_weight_tensor =
        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
    aclTensor* acl_dst =
        ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);

    switch (n_dims) {
        case 2:
            aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
            break;
        case 3:
            aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
            break;
        default:
            aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
            break;
    }

    ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
    ACL_CHECK(aclDestroyTensor(acl_input_tensor));
    ACL_CHECK(aclDestroyTensor(acl_dst));
}

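// Layout assumed by the quantized matmul below: the quantized weight blocks
// come first and their F16 scales follow immediately after them in the same
// buffer (scale_offset = src0->data + weight_size). Each (n, c) batch pair is
// multiplied with aclnnWeightQuantBatchMatmulV2; the weight's ne[1] is
// processed in splits of at most max_elem_size (65535), presumably an
// operator-side size limit, advancing the weight/scale/output offsets between
// calls.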
/**
 * @brief Performs matrix multiplication with quantized weights and
 * floating-point inputs using the CANN backend.
 *
 * This function performs matrix multiplication of the input tensor `src1` and
 * the weight tensor `src0`, handling broadcasting, transposing, and
 * quantization as needed, and stores the result in the destination tensor
 * `dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param dst The destination tensor where the result of the matrix
 * multiplication will be stored.
 */
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                                    ggml_tensor* dst,
                                    const enum ggml_type type) {
    ggml_tensor* src0 = dst->src[0];  // weight
    ggml_tensor* src1 = dst->src[1];  // input

    // The shape of the weight is NCHW.
    // Matrix multiplication uses the HW dims.
    // HC is regarded as batch.
    // The weight needs to be transposed.
    float weight_elem_size;
    if (type == GGML_TYPE_Q4_0) {
        weight_elem_size = float(sizeof(uint8_t)) / 2;
    } else if (type == GGML_TYPE_Q8_0) {
        weight_elem_size = float(sizeof(uint8_t));
    } else {
        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
    }
    float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
    size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];

    // The scales are stored at the end of the weight and also need a transpose.
    size_t scale_elem_size = sizeof(uint16_t);
    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
                         scale_elem_size};
    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
    char* scale_offset = (char*)src0->data + weight_size;

    // input
    size_t input_elem_size = sizeof(uint16_t);
    int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
    size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
    size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
    ggml_cann_pool_alloc input_alloctor(ctx.pool());
    void* input_buffer = src1->data;

    // cast in
    if (src1->type != GGML_TYPE_F16) {
        aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
        input_buffer =
            input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);

        int64_t* input_cast_ne = src1->ne;
        size_t input_cast_nb[GGML_MAX_DIMS];
        input_cast_nb[0] = sizeof(uint16_t);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
        }

        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
            input_cast_nb, GGML_MAX_DIMS);
        aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);

        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
        ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
    }

    // output
    size_t output_elem_size = sizeof(uint16_t);
    size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
    ggml_cann_pool_alloc output_allocator(ctx.pool());
    void* output_buffer =
        output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
    size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;

    // aclnn
    int64_t max_elem_size = 65535;
    int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
    aclOpExecutor* executor = nullptr;
    uint64_t workspaceSize = 0;
    void* workspaceAddr = nullptr;
    for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
        for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
            int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
            int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);

            int64_t batch1 = (n1 * src1->ne[2]) + c1;
            int64_t batch0 = (n0 * src0->ne[2]) + c0;

            aclTensor* acl_input_tensor = ggml_cann_create_tensor(
                (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                input_elem_size, input_ne, input_nb, 2);

            // first split
            int64_t weight_ne_offset = 0;
            int64_t weight_ne[2] = {
                max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
                src0->ne[0]};
            int64_t scale_ne_offset = 0;
            int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
            int64_t output_ne_offset = 0;
            int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};

            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                (char*)src0->data + batch0 * weight_stride,
                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
                weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
                scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
                scale_ne_offset);
            aclTensor* acl_output_tensor = ggml_cann_create_tensor(
                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
                output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
                output_ne_offset);

            ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
                &workspaceSize, &executor));
            if (workspaceAddr == nullptr) {
                workspaceAddr = workspace_allocator.alloc(workspaceSize);
            }
            ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                workspaceAddr, workspaceSize, executor, ctx.stream()));

            ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
            ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
            ACL_CHECK(aclDestroyTensor(acl_output_tensor));

            // other splits
            for (int64_t split = 1; split < split_size; split++) {
                weight_ne_offset +=
                    weight_elem_size * weight_ne[0] * weight_ne[1];
                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
                                   ? src0->ne[1] - (max_elem_size * split)
                                   : max_elem_size;
                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
                scale_ne[0] = weight_ne[0];
                output_ne_offset +=
                    output_elem_size * output_ne[0] * output_ne[1];
                output_ne[0] = weight_ne[0];

                acl_weight_tensor = ggml_cann_create_tensor(
                    (char*)src0->data + batch0 * weight_stride,
                    ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
                    weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
                acl_scale_tensor = ggml_cann_create_tensor(
                    scale_offset + batch0 * scale_stride, ACL_FLOAT16,
                    scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
                    scale_ne_offset);
                acl_output_tensor = ggml_cann_create_tensor(
                    (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
                    output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
                    output_ne_offset);

                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
                    nullptr, nullptr, nullptr, nullptr, QK8_0,
                    acl_output_tensor, &workspaceSize, &executor));
                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                    workspaceAddr, workspaceSize, executor, ctx.stream()));

                ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
                ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
                ACL_CHECK(aclDestroyTensor(acl_output_tensor));
            }

            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
        }
    }

    // cast out
    if (dst->type != GGML_TYPE_F16) {
        int64_t* output_cast_ne = dst->ne;
        size_t output_cast_nb[GGML_MAX_DIMS];
        output_cast_nb[0] = sizeof(uint16_t);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
        }

        aclTensor* acl_output_tensor = ggml_cann_create_tensor(
            output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
            output_cast_nb, GGML_MAX_DIMS);
        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
                   ggml_cann_type_mapping(dst->type));

        ACL_CHECK(aclDestroyTensor(acl_output_tensor));
        ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
    }
}

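// MUL_MAT dispatch: F32/F16 weights take the floating-point path above,
// Q4_0/Q8_0 weights take the weight-quantized path; any other weight type is
// currently rejected with GGML_ABORT.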
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    const enum ggml_type type = dst->src[0]->type;
    switch (type) {
        case GGML_TYPE_F32:
        case GGML_TYPE_F16:
            ggml_cann_mat_mul_fp(ctx, dst);
            break;
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
            ggml_cann_mul_mat_quant(ctx, dst, type);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}

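// aclnn_roll shifts elements with wrap-around, e.g. a shift of 1 along a
// dimension holding [a, b, c, d] yields [d, a, b, c]. The 310P RoPE path
// further down uses it, together with an index-fill of -1, to build the
// rotated companion of each input element.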
/**
 * @brief Rolls the elements of a tensor along a specified dimension.
 *
 * This function rolls the elements of the source tensor `acl_src` by the
 * specified shifts `shifts` along the specified dimensions `dims`, and stores
 * the result in the destination tensor `acl_dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor whose elements will be rolled.
 * @param acl_dst The destination tensor where the rolled elements will be
 * stored.
 * @param shifts An array specifying the number of positions by which elements
 * are shifted.
 * @param dims An array specifying the dimensions along which elements are
 * shifted.
 */
static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                       aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
    aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
    aclIntArray* acl_dims = aclCreateIntArray(dims, 1);

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst,
                                        &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(aclnnRoll(workspaceAddr, workspaceSize, executor, ctx.stream()));

    ACL_CHECK(aclDestroyIntArray(acl_shifts));
    ACL_CHECK(aclDestroyIntArray(acl_dims));
}

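// aclnn_index_fill_tensor overwrites the positions listed in `index` along
// `dim` with a scalar value, in place. The non-NeoX RoPE path below fills the
// even positions of an all-ones vector with -1 to obtain the
// [-1, 1, -1, 1, ...] sign pattern.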
/**
 * @brief Fills specified positions of a tensor with a scalar value.
 *
 * This function fills the positions in the source tensor `acl_src` specified by
 * `index` along the dimension `dim` with the scalar value `value`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor where the positions will be filled.
 * @param dim The dimension along which the positions are specified.
 * @param index An array specifying the positions to be filled.
 * @param index_num The number of positions specified in the index array.
 * @param value The scalar value used to fill the specified positions.
 */
static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
                                    aclTensor* acl_src, int64_t dim,
                                    int64_t* index, int64_t index_num,
                                    float value) {
    aclIntArray* acl_index = aclCreateIntArray(index, index_num);
    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize(
        acl_src, dim, acl_index, acl_value, &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize,
                                          executor, ctx.stream()));

    ACL_CHECK(aclDestroyIntArray(acl_index));
    ACL_CHECK(aclDestroyScalar(acl_value));
}

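// aclnn_cache_init builds the RoPE sin/cos tables. Roughly, for channel index
// i in [0, ne0/2) and position p it computes
//     theta(i, p) = p * freq_scale * theta_scale^i
// (additionally divided by freq_factors when those are provided), stores
// sin(theta) and cos(theta), scales both by attn_factor, and finally either
// tiles them (NeoX) or repeat-interleaves them (non-NeoX) so that each of the
// ne0 channels receives its angle.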
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                             aclTensor* acl_cos_repeat_tensor,
                             aclTensor* acl_sin_repeat_tensor,
                             float theta_scale, float freq_scale,
                             float attn_factor, bool is_neox) {
    // init the sin/cos cache; the cache uses a different repeat method
    // depending on @param is_neox

    ggml_tensor* src0 = dst->src[0];  // input
    ggml_tensor* src1 = dst->src[1];  // position
    ggml_tensor* src2 = dst->src[2];  // freq_factors

    // arange, [0,1,...,ne0/2]
    int64_t arange_length = src0->ne[0] / 2;
    ggml_cann_pool_alloc arange_allocator(ctx.pool(),
                                          arange_length * sizeof(float_t));
    void* arange_buffer = arange_allocator.get();
    int64_t arange_ne[] = {arange_length, 1, 1, 1};
    size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
                          arange_length * sizeof(float_t)};

    aclTensor* acl_arange_tensor =
        ggml_cann_create_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t),
                                arange_ne, arange_nb, GGML_MAX_DIMS);
    float start = 0;
    float step = 1;
    float stop = src0->ne[0] / 2;
    float n_elements = src0->ne[0] / 2;
    aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements);

    // power
    // aclnnPowScalarTensor(): @param self is a tensor but should be a scalar,
    // so use aclnn_pow_tensor_tensor() until fixed:
    // aclScalar* acl_theta_scale =
    //     aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
    // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor,
    //                           acl_power_tensor);
    ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
                                               arange_length * sizeof(float_t));
    void* theta_scale_buffer = theta_scale_allocator.get();
    aclTensor* acl_theta_scale_tensor = aclnn_values(
        ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
        GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
    aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);

    // freq_scale
    if (freq_scale != 1) {
        aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
    }

    // freq_factors
    if (src2) {
        aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
            src2->data, ggml_cann_type_mapping(src2->type),
            ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
        aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
                         nullptr, true);
        ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
    }

    // position
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
    int64_t position_length = src1->ne[0];
    int64_t position_ne[] = {1, position_length, 1, 1};
    size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t),
                            sizeof(int32_t) * position_length,
                            sizeof(int32_t) * position_length};
    aclTensor* acl_position_tensor = ggml_cann_create_tensor(
        src1->data, ggml_cann_type_mapping(src1->type),
        ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);

    // power * position
    int64_t theta_length = arange_length * position_length;
    ggml_cann_pool_alloc theta_allocator(ctx.pool(),
                                         theta_length * sizeof(float_t));
    void* theta_buffer = theta_allocator.get();
    int64_t theta_ne[] = {arange_length, position_length, 1, 1};
    size_t theta_nb[GGML_MAX_DIMS];
    theta_nb[0] = sizeof(float_t);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
    }
    aclTensor* acl_theta_tensor =
        ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
                                theta_ne, theta_nb, GGML_MAX_DIMS);
    aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
              acl_theta_tensor);

    // permute: [0,1,2,3]->[0,2,1,3]
    int64_t permute_ne[] = {arange_length, 1, position_length, 1};
    size_t permute_nb[GGML_MAX_DIMS];
    permute_nb[0] = sizeof(float_t);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1];
    }
    ggml_cann_pool_alloc permute_allocator(ctx.pool(),
                                           theta_length * sizeof(float_t));
    void* permute_buffer = permute_allocator.get();
    aclTensor* acl_permute_tensor = ggml_cann_create_tensor(
        permute_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
        GGML_MAX_DIMS, ACL_FORMAT_ND);
    int64_t permute_dim[] = {0, 2, 1, 3};
    int64_t num_dims = 4;
    aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim,
                  num_dims);

    // sin/cos
    ggml_cann_pool_alloc sin_allocator(ctx.pool(),
                                       theta_length * sizeof(float_t));
    void* sin_buffer = sin_allocator.get();
    aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
        sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
        GGML_MAX_DIMS, ACL_FORMAT_ND);
    aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor);

    ggml_cann_pool_alloc cos_allocator(ctx.pool(),
                                       theta_length * sizeof(float_t));
    void* cos_buffer = cos_allocator.get();
    aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
        cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
        GGML_MAX_DIMS, ACL_FORMAT_ND);
    aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);

    // attn_factor
    if (attn_factor != 1) {
        aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
        aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
    }

    // repeat
    if (is_neox) {
        int64_t repeatsArray[] = {1, 1, 1, 2};
        aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray);
        aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray);
    } else {
        int64_t num_repeats = 2;
        int64_t dim = 3;
        int64_t output_size = arange_length * num_repeats;
        aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
                                num_repeats, output_size);
        aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
                                num_repeats, output_size);
    }

    // release
    ACL_CHECK(aclDestroyTensor(acl_arange_tensor));
    ACL_CHECK(aclDestroyTensor(acl_theta_scale_tensor));
    ACL_CHECK(aclDestroyTensor(acl_position_tensor));
    ACL_CHECK(aclDestroyTensor(acl_theta_tensor));
    ACL_CHECK(aclDestroyTensor(acl_permute_tensor));
    ACL_CHECK(aclDestroyTensor(acl_sin_tensor));
    ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
}

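// Forward declarations for the aclnnRotaryPositionEmbedding pair used by
// ggml_cann_rope below; they follow the usual two-step aclnn pattern of
// querying the workspace size first and then launching with an executor on
// the stream.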
#ifdef __cplusplus
extern "C" {
#endif
aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
    aclOpExecutor** executor);
aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
                                         uint64_t workspaceSize,
                                         aclOpExecutor* executor,
                                         aclrtStream stream);
#ifdef __cplusplus
}
#endif

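// ggml_cann_rope applies rotary position embedding to src0 using the sin/cos
// cache initialised by aclnn_cache_init. theta_scale = powf(freq_base,
// -2.0f / n_dims), so theta_scale^i = freq_base^(-2i / n_dims), the standard
// RoPE inverse-frequency schedule. Note the restrictions asserted below:
// n_dims must equal ne0 and ext_factor must be 0. On non-310P devices the
// rotation itself is delegated to aclnnRotaryPositionEmbedding, with ggml
// mode 0 remapped to the operator's mode 1.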
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    // TODO: use ascendc
    // Only tested with the LLAMA model.
    ggml_tensor* src0 = dst->src[0];  // input
    ggml_tensor* src2 = dst->src[2];  // freq_factors

    // param
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    // const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_dims = ((int32_t*)dst->op_params)[1];
    const int mode = ((int32_t*)dst->op_params)[2];
    // const int n_ctx = ((int32_t *) dst->op_params)[3];
    const int n_ctx_orig = ((int32_t*)dst->op_params)[4];

    GGML_TENSOR_UNARY_OP_LOCALS

    memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float));
    memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float));
    memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float));
    memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float));
    memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
    memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));

    // TODO: n_dims <= ne0
    GGML_ASSERT(n_dims == ne0);
    GGML_ASSERT(n_dims % 2 == 0);
    // TODO: ext_factor != 0
    GGML_ASSERT(ext_factor == 0);

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

    float corr_dims[2];
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
                             beta_slow, corr_dims);

    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

    // init cos/sin cache
    ggml_cann_pool_alloc sin_allocator(
        ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
    ggml_cann_pool_alloc cos_allocator(
        ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
    void* sin_buffer = sin_allocator.get();
    void* cos_buffer = cos_allocator.get();

    int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
    size_t sin_reshape_nb[GGML_MAX_DIMS];
    sin_reshape_nb[0] = sizeof(float_t);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
    }
    aclTensor* acl_sin_reshape_tensor =
        ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t),
                                sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
    aclTensor* acl_cos_reshape_tensor =
        ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
                                sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
    aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                     theta_scale, freq_scale, attn_factor, is_neox);

    aclTensor* acl_src = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

#ifdef ASCEND_310P
    // Special ROPE operation for 310P

    // roll input
    void* input_roll_buffer;
    aclTensor* acl_minus_one_tensor;
    void* minus_one_scale_buffer = nullptr;
    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
    ggml_cann_pool_alloc minus_one_scale_allocator(
        ctx.pool(), sizeof(float_t) * src0->ne[0]);
    if (!is_neox) {
        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
        input_roll_buffer = roll_allocator.get();
        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
                                    src0->ne[2], src0->ne[3]};
        size_t input_roll_nb[GGML_MAX_DIMS];
        input_roll_nb[0] = ggml_type_size(src0->type);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
        }
        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
            input_roll_buffer, ggml_cann_type_mapping(src0->type),
            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
            GGML_MAX_DIMS);
        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
            src0->data, ggml_cann_type_mapping(src0->type),
            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
            GGML_MAX_DIMS);

        int64_t shifts[] = {1};
        int64_t dims[] = {3};
        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
        ACL_CHECK(aclDestroyTensor(acl_input_tensor));

        // init [-1, 1, -1, 1, ...]
        minus_one_scale_buffer = minus_one_scale_allocator.get();

        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
        size_t minus_one_nb[GGML_MAX_DIMS];
        minus_one_nb[0] = sizeof(float_t);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
        }
        acl_minus_one_tensor = aclnn_values(
            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
        int64_t dim = 3;
        int64_t* index = new int64_t[src0->ne[0]];
        for (int i = 0; i < src0->ne[0]; i++) {
            index[i] = i / 2 * 2;
        }
        int64_t index_num = src0->ne[0];
        float value = -1;
        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
                                index_num, value);
    } else {
        // roll input: [q0,q1,q2,...] ->
        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
        input_roll_buffer = roll_allocator.get();
        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
            input_roll_buffer, ggml_cann_type_mapping(src0->type),
            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);

        int64_t shifts[] = {src0->ne[0] / 2};
        int64_t dims[] = {3};
        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);

        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
        // init [-1, -1, -1, 1, 1, 1, ...]
        minus_one_scale_buffer = minus_one_scale_allocator.get();
        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
        size_t minus_one_nb[GGML_MAX_DIMS];
        minus_one_nb[0] = sizeof(float_t);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
        }
        acl_minus_one_tensor = aclnn_values(
            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
        // -1 * first half
        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
        size_t first_half_nb[GGML_MAX_DIMS];
        first_half_nb[0] = sizeof(float_t);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
        }
        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
            first_half_nb, GGML_MAX_DIMS);
        bool inplace = true;
        float scale = -1;
        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
    }

    // TODO: n_dims < ne0
    GGML_ASSERT(n_dims == src0->ne[0]);

    // input * scale
    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
                                                  ggml_nbytes(src0));
    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
    size_t input_nb[GGML_MAX_DIMS];
    input_nb[0] = ggml_type_size(src0->type);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
    }
    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
        input_roll_buffer, ggml_cann_type_mapping(src0->type),
        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);

    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
              acl_input_roll_mul_scale_tensor);

    // output
    void* output_fp32_buffer;
    if (src0->type == GGML_TYPE_F32) {
        aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
                          acl_sin_reshape_tensor);
        aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
        // TODO: ne0 != n_dims in mode2
    } else if (src0->type == GGML_TYPE_F16) {
        size_t input_fp32_nb[GGML_MAX_DIMS];
        input_fp32_nb[0] = sizeof(float_t);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
        }
        ggml_cann_pool_alloc fp32_allocator1(
            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
        void* input_fp32_buffer1 = fp32_allocator1.get();
        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
            input_fp32_nb, GGML_MAX_DIMS);
        ggml_cann_pool_alloc fp32_allocator2(
            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
        void* input_fp32_buffer2 = fp32_allocator2.get();
        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
            input_fp32_nb, GGML_MAX_DIMS);

        ggml_cann_pool_alloc fp32_allocator(
            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
        output_fp32_buffer = fp32_allocator.get();
        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
            input_fp32_nb, GGML_MAX_DIMS);
        aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
                  input_fp32_tensor2);
        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
                  output_fp32_tensor);
        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);

        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
        ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
        ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
        ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
        ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
        ACL_CHECK(aclDestroyTensor(acl_src));
    }
    return;
#endif

    // src0 == GGML_TYPE_F16
    // TODO: optimize this `if` block
    if (src0->type == GGML_TYPE_F16) {
        ggml_cann_pool_alloc sin_final_allocator(
            ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
        ggml_cann_pool_alloc cos_final_allocator(
            ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
        void* sin_final_buffer = sin_final_allocator.get();
        void* cos_final_buffer = cos_final_allocator.get();

        int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
        size_t sin_final_nb[GGML_MAX_DIMS];
        sin_final_nb[0] = ggml_type_size(src0->type);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
        }
        aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
            sin_final_buffer, ggml_cann_type_mapping(src0->type),
            ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
            GGML_MAX_DIMS);
        aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
            cos_final_buffer, ggml_cann_type_mapping(src0->type),
            ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
            GGML_MAX_DIMS);

        aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
                   ggml_cann_type_mapping(src0->type));
        aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
                   ggml_cann_type_mapping(src0->type));
        ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
        ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
        acl_sin_reshape_tensor = acl_sin_final_tensor;
        acl_cos_reshape_tensor = acl_cos_final_tensor;
    }

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;

    void* workspaceAddr = nullptr;

    int acl_mode = mode;
    if (mode == 0) {
        acl_mode = 1;
    }

    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
        acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
        acl_dst, &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
                                           executor, ctx.stream()));

    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
    ACL_CHECK(aclDestroyTensor(acl_dst));
}