whispercpp 1.2.0.2 → 1.3.1

Files changed (135)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@@ -0,0 +1,19 @@
+ #ifndef ASCENDC_KERNELS_H
+ #define ASCENDC_KERNELS_H
+
+ #include "aclrtlaunch_ascendc_get_row_f32.h"
+ #include "aclrtlaunch_ascendc_get_row_f16.h"
+ #include "aclrtlaunch_ascendc_get_row_q8_0.h"
+ #include "aclrtlaunch_ascendc_get_row_q4_0.h"
+
+ #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
+ #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
+ #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
+ #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
+
+ #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
+ #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
+ #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
+ #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
+
+ #endif // ASCENDC_KERNELS_H
data/ext/ggml/src/ggml-cann/kernels/dup.cpp
@@ -0,0 +1,236 @@
+ #include "kernel_operator.h"
+
+ #include <cmath>
+
+ using namespace AscendC;
+
+ #define BUFFER_NUM 2
+ const int64_t SUPPORTED_MAX_DIM = 65535;  // currently the limit of max block dim supportted by dup kernel is 65535
+
+ template <typename SRC_T, typename DST_T>
+ class DupByRows {
+   public:
+     __aicore__ inline DupByRows() {}
+     __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
+                                 size_t *input_nb_ub) {
+         /* Dup by rows when src is contigous on first dimension and dst is
+            contiguous, each kernel process one row.
+         */
+
+         // Input has four dims.
+         int64_t op_block_num = GetBlockNum();
+         int64_t op_block_idx = GetBlockIdx();
+
+         // param
+         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
+         num_elem = input_ne_ub[0];
+
+         // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
+         idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
+         idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
+                   / (input_ne_ub[1]);
+         idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
+                   - idx_ne2 * input_ne_ub[1];
+
+         // src may not contiguous in dim [1,2,3], so stride decited by ne&nb
+         src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
+                      + input_nb_ub[1] * idx_ne1;
+
+         // dst is contiguous
+         dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
+
+         src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
+                                                                 src_stride));
+         dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
+                                                                 dst_stride));
+
+         pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
+                                                 32 - 1) / 32 * 32);
+         pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
+                                                 32 - 1) / 32 * 32);
+     }
+
+     __aicore__ inline void copy_in() {
+         LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
+         const size_t elem_per_block = 32 / sizeof(SRC_T);
+         size_t tail = num_elem % elem_per_block;
+         size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
+         DataCopy(src_local, src_gm, cpy_elements_len);
+         src_queue.EnQue(src_local);
+     }
+
+     __aicore__ inline void copy_out() {
+         LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
+ #ifdef ASCEND_310P
+         const size_t elem_per_block = 32 / sizeof(DST_T);
+         size_t tail = num_elem % elem_per_block;
+         size_t len = num_elem & ~(elem_per_block - 1);
+         if (len > 0) {
+             DataCopy(dst_gm, dst_local, len);
+         }
+         if(tail != 0) {
+             for (size_t i = tail; i < elem_per_block; i++) {
+                 dst_local[len + i].SetValue(0, 0);
+             }
+             SetAtomicAdd<float>();
+             DataCopy(dst_gm[len], dst_local[len], elem_per_block);
+             SetAtomicNone();
+         }
+ #else
+         DataCopyExtParams dataCopyParams;
+         dataCopyParams.blockCount = 1;
+         dataCopyParams.blockLen = num_elem * sizeof(DST_T);
+         DataCopyPad(dst_gm, dst_local, dataCopyParams);
+ #endif
+         dst_queue.FreeTensor(dst_local);
+     }
+
+     __aicore__ inline void dup() {
+         // main process, copy one row data from src to dst.
+         copy_in();
+
+         LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
+         LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
+
+         int32_t BLOCK_NUM = 32 / sizeof(DST_T);
+         DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
+                                         / BLOCK_NUM * BLOCK_NUM);
+         dst_queue.EnQue<DST_T>(dst_local);
+
+         src_queue.FreeTensor(src_local);
+         copy_out();
+     }
+
+     __aicore__ inline void dup_with_cast() {
+         // main process, copy one row data from src to dst.
+         // cast dtype from src to dst.
+         copy_in();
+
+         LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
+         LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
+
+         Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
+         dst_queue.EnQue<DST_T>(dst_local);
+
+         src_queue.FreeTensor(src_local);
+         copy_out();
+     }
+
+   private:
+
+     TPipe pipe;
+     GlobalTensor<SRC_T> src_gm;
+     GlobalTensor<DST_T> dst_gm;
+
+     int64_t num_rows;
+     int64_t num_elem;
+     int64_t idx_ne3;
+     int64_t idx_ne2;
+     int64_t idx_ne1;
+     int64_t src_stride;
+     int64_t dst_stride;
+
+     TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
+     TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
+ };
+
+ template <typename T>
+ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+     auto gm_ptr = (__gm__ uint8_t *)gm;
+     auto ub_ptr = (uint8_t *)(ub);
+     for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+         *ub_ptr = *gm_ptr;
+     }
+ }
+
+ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
+         GM_ADDR src_gm,
+         GM_ADDR dst_gm,
+         GM_ADDR input_ne_gm,
+         GM_ADDR input_nb_gm,
+         GM_ADDR output_ne_gm,
+         GM_ADDR output_nb_gm) {
+
+     int64_t input_ne_ub[4];
+     size_t input_nb_ub[4];
+     int64_t output_ne_ub[4];
+     size_t output_nb_ub[4];
+
+     copy_to_ub(input_ne_gm, input_ne_ub, 32);
+     copy_to_ub(input_nb_gm, input_nb_ub, 32);
+     copy_to_ub(output_ne_gm, output_ne_ub, 32);
+     copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+     DupByRows<half, half> op;
+     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
+     op.dup();
+ }
+
+ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
+         GM_ADDR src_gm,
+         GM_ADDR dst_gm,
+         GM_ADDR input_ne_gm,
+         GM_ADDR input_nb_gm,
+         GM_ADDR output_ne_gm,
+         GM_ADDR output_nb_gm) {
+     int64_t input_ne_ub[4];
+     size_t input_nb_ub[4];
+     int64_t output_ne_ub[4];
+     size_t output_nb_ub[4];
+
+     copy_to_ub(input_ne_gm, input_ne_ub, 32);
+     copy_to_ub(input_nb_gm, input_nb_ub, 32);
+     copy_to_ub(output_ne_gm, output_ne_ub, 32);
+     copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+     DupByRows<float_t, float_t> op;
+     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
+     op.dup();
+ }
+
+ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
+         GM_ADDR src_gm,
+         GM_ADDR dst_gm,
+         GM_ADDR input_ne_gm,
+         GM_ADDR input_nb_gm,
+         GM_ADDR output_ne_gm,
+         GM_ADDR output_nb_gm) {
+
+     int64_t input_ne_ub[4];
+     size_t input_nb_ub[4];
+     int64_t output_ne_ub[4];
+     size_t output_nb_ub[4];
+
+     copy_to_ub(input_ne_gm, input_ne_ub, 32);
+     copy_to_ub(input_nb_gm, input_nb_ub, 32);
+     copy_to_ub(output_ne_gm, output_ne_ub, 32);
+     copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+     DupByRows<float_t, half> op;
+     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
+     op.dup_with_cast();
+ }
+
+ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
+         GM_ADDR src_gm,
+         GM_ADDR dst_gm,
+         GM_ADDR input_ne_gm,
+         GM_ADDR input_nb_gm,
+         GM_ADDR output_ne_gm,
+         GM_ADDR output_nb_gm) {
+
+     // copy params from gm to ub.
+     int64_t input_ne_ub[4];
+     size_t input_nb_ub[4];
+     int64_t output_ne_ub[4];
+     size_t output_nb_ub[4];
+
+     copy_to_ub(input_ne_gm, input_ne_ub, 32);
+     copy_to_ub(input_nb_gm, input_nb_ub, 32);
+     copy_to_ub(output_ne_gm, output_ne_ub, 32);
+     copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+     DupByRows<half, float_t> op;
+     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
+     op.dup_with_cast();
+ }
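
The DupByRows kernel above assigns one block per row and derives the row's (ne1, ne2, ne3) coordinates from the flat block index. The following host-side C++ sketch is not part of the gem; the ne/nb values are made-up examples. It reproduces that index arithmetic so it can be checked off-device.

```cpp
// Host-side illustration of the row-index decomposition used in DupByRows::init.
// Not part of the gem; the ne/nb values below are hypothetical examples.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical tensor extents (ne) and byte strides (nb) for dims 1..3.
    const int64_t ne1 = 4, ne2 = 3, ne3 = 2;
    const size_t  nb1 = 128, nb2 = 512, nb3 = 1536;

    const int64_t num_rows = ne1 * ne2 * ne3;   // one kernel block per row

    for (int64_t block_idx = 0; block_idx < num_rows; ++block_idx) {
        // Same arithmetic as the kernel: flat block index -> (idx_ne1, idx_ne2, idx_ne3).
        const int64_t idx_ne3 = block_idx / (ne1 * ne2);
        const int64_t idx_ne2 = (block_idx - idx_ne3 * ne1 * ne2) / ne1;
        const int64_t idx_ne1 = block_idx - idx_ne3 * ne1 * ne2 - idx_ne2 * ne1;

        // Recomposing the coordinates must give back the flat index.
        assert(block_idx == (idx_ne3 * ne2 + idx_ne2) * ne1 + idx_ne1);

        // Byte offset of the row in a possibly non-contiguous source tensor.
        const size_t src_offset = nb3 * idx_ne3 + nb2 * idx_ne2 + nb1 * idx_ne1;
        printf("block %2lld -> (%lld, %lld, %lld), src offset %zu\n",
               (long long)block_idx, (long long)idx_ne1, (long long)idx_ne2,
               (long long)idx_ne3, src_offset);
    }
    return 0;
}
```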
data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@@ -0,0 +1,197 @@
+ #include "kernel_operator.h"
+
+ // optimize me. Use template to avoid copy code.
+ using namespace AscendC;
+
+ #define BUFFER_NUM 2
+
+ class GET_ROW_F16 {
+   public:
+     __aicore__ inline GET_ROW_F16() {}
+     __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
+                                 int64_t *input_ne_ub, size_t *input_nb_ub,
+                                 int64_t *indices_ne_ub, size_t *indices_nb_ub,
+                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
+         // TODO, use template for F16/f32
+         int64_t op_block_num = GetBlockNum();
+         op_block_idx = GetBlockIdx();
+
+         for (int i = 0; i < 4; i++) {
+             input_ne[i] = input_ne_ub[i];
+             input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+
+             indices_ne[i] = indices_ne_ub[i];
+             indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
+
+             output_ne[i] = output_ne_ub[i];
+             output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
+         }
+
+         // Indices has two dims. n_elements = all rows should get.
+         // dr, all rows should this thread get.
+         uint64_t n_elements =
+             indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
+         dr = n_elements / op_block_num;
+
+         uint64_t tails = n_elements % op_block_num;
+         if (op_block_idx < tails) {
+             dr += 1;
+             ir = dr * op_block_idx;
+         } else {
+             ir = dr * op_block_idx + tails;
+         }
+
+         input_gm.SetGlobalBuffer((__gm__ half *)input);
+         indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
+         output_gm.SetGlobalBuffer((__gm__ float *)output);
+
+         uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
+                                              & ~31);
+         uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
+                                              & ~31);
+
+         local_buffer_elems = input_local_buffer_size / sizeof(half);
+
+         // TODO, consider long row that can't put in UB.
+         // All data should asign to 32. It's ok because all data is align to 32.
+         pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
+         pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
+     }
+
+     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+         size_t origin_len = len;
+         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
+         const size_t elem_per_block = 32 / sizeof(half);
+         size_t tail = len % elem_per_block;
+         len = len & ~(elem_per_block - 1);
+         if(tail != 0) {
+             len += elem_per_block;
+         }
+         DataCopy(input_local, input_gm[offset], len);
+         input_queue.EnQue(input_local);
+     }
+
+     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
+         LocalTensor<float> output_local = output_queue.DeQue<float>();
+         const size_t elem_per_block = 32 / sizeof(float);
+         size_t tail = len % elem_per_block;
+         len = len & ~(elem_per_block - 1);
+         if (len > 0) {
+             DataCopy(output_gm[offset], output_local, len);
+         }
+
+         if(tail != 0) {
+ #ifdef ASCEND_310P
+             for (size_t i = tail; i < elem_per_block; i++) {
+                 output_local[len + i].SetValue(0, 0);
+             }
+             SetAtomicAdd<float>();
+             DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+             SetAtomicNone();
+ #else
+             DataCopyExtParams dataCopyParams;
+             dataCopyParams.blockCount = 1;
+             dataCopyParams.blockLen = tail * sizeof(float);
+             DataCopyPad(output_gm[offset + len], output_local[len],
+                         dataCopyParams);
+ #endif
+         }
+         output_queue.FreeTensor(output_local);
+     }
+
+     __aicore__ inline void calculate_row(int64_t idx) {
+         const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
+         const int64_t indices_ne1_idx =
+             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
+             indices_ne[0];
+         const int64_t indices_ne0_idx =
+             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
+              indices_ne1_idx * indices_ne[0]);
+
+         const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
+                                        indices_ne1_idx * indices_stride[1] +
+                                        indices_ne2_idx * indices_stride[2];
+         const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
+
+         const int64_t input_offset = selected_row_idx * input_stride[1] +
+                                      indices_ne1_idx * input_stride[2] +
+                                      indices_ne2_idx * input_stride[3];
+
+         const int64_t output_offset = indices_ne0_idx * output_stride[1] +
+                                       indices_ne1_idx * output_stride[2] +
+                                       indices_ne2_idx * output_stride[3];
+
+         copy_in(input_offset, input_ne[0]);
+         LocalTensor<half> input_local = input_queue.DeQue<half>();
+         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
+
+         Cast(output_local, input_local, RoundMode::CAST_NONE,
+              local_buffer_elems);
+         output_queue.EnQue(output_local);
+         copy_out(output_offset, input_ne[0]);
+
+         input_queue.FreeTensor(input_local);
+     }
+
+     __aicore__ inline void calculate() {
+         for (int64_t i = ir; i < ir + dr; i++) {
+             calculate_row(i);
+         }
+     }
+
+   private:
+     int64_t input_ne[4];
+     size_t input_stride[4];
+
+     int64_t indices_ne[4];
+     size_t indices_stride[4];
+
+     int64_t output_ne[4];
+     size_t output_stride[4];
+
+     size_t local_buffer_elems;
+
+     int64_t ir;
+     int64_t dr;
+
+     TPipe pipe;
+     GlobalTensor<half> input_gm;
+     GlobalTensor<int32_t> indices_gm;
+     GlobalTensor<float> output_gm;
+     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+     int64_t op_block_idx;
+ };
+
+ template <typename T>
+ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+     auto gm_ptr = (__gm__ uint8_t *)gm;
+     auto ub_ptr = (uint8_t *)(ub);
+     for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+         *ub_ptr = *gm_ptr;
+     }
+ }
+
+ extern "C" __global__ __aicore__ void ascendc_get_row_f16(
+         GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+         GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
+         GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+     int64_t input_ne_ub[4];
+     size_t input_nb_ub[4];
+     int64_t indices_ne_ub[4];
+     size_t indices_nb_ub[4];
+     int64_t output_ne_ub[4];
+     size_t output_nb_ub[4];
+
+     copy_to_ub(input_ne_gm, input_ne_ub, 32);
+     copy_to_ub(input_nb_gm, input_nb_ub, 32);
+     copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
+     copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
+     copy_to_ub(output_ne_gm, output_ne_ub, 32);
+     copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+     GET_ROW_F16 op;
+     op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
+             indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
+     op.calculate();
+ }
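
GET_ROW_F16::init splits the total number of index entries across the launched blocks, giving the first n_elements % op_block_num blocks one extra row. The minimal host-side C++ sketch below, with hypothetical row and block counts, walks through that partitioning and checks that the blocks tile the row range exactly.

```cpp
// Host-side illustration of how GET_ROW_F16 distributes rows across kernel blocks.
// Not part of the gem; n_rows and n_blocks are hypothetical totals.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t n_rows = 10, n_blocks = 4;

    uint64_t covered = 0;
    for (uint64_t block_idx = 0; block_idx < n_blocks; ++block_idx) {
        // Same arithmetic as GET_ROW_F16::init: dr rows per block, the first
        // `tails` blocks take one extra row, ir is the first row for this block.
        uint64_t dr    = n_rows / n_blocks;
        uint64_t tails = n_rows % n_blocks;
        uint64_t ir;
        if (block_idx < tails) {
            dr += 1;
            ir = dr * block_idx;
        } else {
            ir = dr * block_idx + tails;
        }
        printf("block %llu handles rows [%llu, %llu)\n",
               (unsigned long long)block_idx, (unsigned long long)ir,
               (unsigned long long)(ir + dr));
        assert(ir == covered);      // blocks tile the row range without gaps
        covered += dr;
    }
    assert(covered == n_rows);      // and without overlap or leftover rows
    return 0;
}
```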
data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp
@@ -0,0 +1,190 @@
+ #include "kernel_operator.h"
+
+ // optimize me. Use template to avoid copy code.
+ using namespace AscendC;
+
+ #define BUFFER_NUM 2
+
+ class GET_ROW_F32 {
+   public:
+     __aicore__ inline GET_ROW_F32() {}
+     __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
+                                 int64_t *input_ne_ub, size_t *input_nb_ub,
+                                 int64_t *indices_ne_ub, size_t *indices_nb_ub,
+                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
+         int64_t op_block_num = GetBlockNum();
+         op_block_idx = GetBlockIdx();
+
+         for (int i = 0; i < 4; i++) {
+             input_ne[i] = input_ne_ub[i];
+             input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+
+             indices_ne[i] = indices_ne_ub[i];
+             indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
+
+             output_ne[i] = output_ne_ub[i];
+             output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
+         }
+
+         // Indices has two dims. n_elements = all rows should get.
+         // dr, all rows should this thread get.
+         uint64_t n_elements =
+             indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
+         dr = n_elements / op_block_num;
+
+         uint64_t tails = n_elements % op_block_num;
+         if (op_block_idx < tails) {
+             dr += 1;
+             ir = dr * op_block_idx;
+         } else {
+             ir = dr * op_block_idx + tails;
+         }
+
+         input_gm.SetGlobalBuffer((__gm__ float *)input);
+         indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
+         output_gm.SetGlobalBuffer((__gm__ float *)output);
+
+         uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
+         local_buffer_elems = local_buffer_size / sizeof(float);
+
+         // TODO, consider long row that can't put in UB.
+         // All data should asign to 32. It's ok because all data is align to 32.
+         pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
+         pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
+     }
+
+     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+         LocalTensor<float> input_local = input_queue.AllocTensor<float>();
+         const size_t elem_per_block = 32 / sizeof(float);
+         size_t tail = len % elem_per_block;
+         len = len & ~(elem_per_block - 1);
+         if(tail != 0) {
+             len += elem_per_block;
+         }
+         DataCopy(input_local, input_gm[offset], len);
+         input_queue.EnQue(input_local);
+     }
+
+     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
+         LocalTensor<float> output_local = output_queue.DeQue<float>();
+         const size_t elem_per_block = 32 / sizeof(float);
+         size_t tail = len % elem_per_block;
+         len = len & ~(elem_per_block - 1);
+         if (len > 0) {
+             DataCopy(output_gm[offset], output_local, len);
+         }
+
+         if(tail != 0) {
+ #ifdef ASCEND_310P
+             for (size_t i = tail; i < elem_per_block; i++) {
+                 output_local[len + i].SetValue(0, 0);
+             }
+             SetAtomicAdd<float>();
+             DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+             SetAtomicNone();
+ #else
+             DataCopyExtParams dataCopyParams;
+             dataCopyParams.blockCount = 1;
+             dataCopyParams.blockLen = tail * sizeof(float);
+             DataCopyPad(output_gm[offset + len], output_local[len],
+                         dataCopyParams);
+ #endif
+         }
+         output_queue.FreeTensor(output_local);
+     }
+
+     __aicore__ inline void calculate_row(int64_t idx) {
+         const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
+         const int64_t indices_ne1_idx =
+             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
+             indices_ne[0];
+         const int64_t indices_ne0_idx =
+             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
+              indices_ne1_idx * indices_ne[0]);
+
+         const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
+                                        indices_ne1_idx * indices_stride[1] +
+                                        indices_ne2_idx * indices_stride[2];
+         const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
+
+         const int64_t input_offset = selected_row_idx * input_stride[1] +
+                                      indices_ne1_idx * input_stride[2] +
+                                      indices_ne2_idx * input_stride[3];
+
+         const int64_t output_offset = indices_ne0_idx * output_stride[1] +
+                                       indices_ne1_idx * output_stride[2] +
+                                       indices_ne2_idx * output_stride[3];
+
+         copy_in(input_offset, input_ne[0]);
+         LocalTensor<float> input_local = input_queue.DeQue<float>();
+         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
+
+         DataCopy(output_local, input_local, local_buffer_elems);
+         output_queue.EnQue(output_local);
+         copy_out(output_offset, input_ne[0]);
+
+         input_queue.FreeTensor(input_local);
+     }
+
+     __aicore__ inline void calculate() {
+         for (int64_t i = ir; i < ir + dr; i++) {
+             calculate_row(i);
+         }
+     }
+
+   private:
+     int64_t input_ne[4];
+     size_t input_stride[4];
+
+     int64_t indices_ne[4];
+     size_t indices_stride[4];
+
+     int64_t output_ne[4];
+     size_t output_stride[4];
+
+     size_t local_buffer_elems;
+
+     int64_t ir;
+     int64_t dr;
+
+     TPipe pipe;
+     GlobalTensor<float> input_gm;
+     GlobalTensor<int32_t> indices_gm;
+     GlobalTensor<float> output_gm;
+     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+     int64_t op_block_idx;
+ };
+
+ template <typename T>
+ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+     auto gm_ptr = (__gm__ uint8_t *)gm;
+     auto ub_ptr = (uint8_t *)(ub);
+     for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+         *ub_ptr = *gm_ptr;
+     }
+ }
+
+ extern "C" __global__ __aicore__ void ascendc_get_row_f32(
+         GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+         GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
+         GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+     int64_t input_ne_ub[4];
+     size_t input_nb_ub[4];
+     int64_t indices_ne_ub[4];
+     size_t indices_nb_ub[4];
+     int64_t output_ne_ub[4];
+     size_t output_nb_ub[4];
+
+     copy_to_ub(input_ne_gm, input_ne_ub, 32);
+     copy_to_ub(input_nb_gm, input_nb_ub, 32);
+     copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
+     copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
+     copy_to_ub(output_ne_gm, output_ne_ub, 32);
+     copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+     GET_ROW_F32 op;
+     op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
+             indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
+     op.calculate();
+ }
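
Both get_row kernels round row lengths to whole 32-byte blocks: copy_in rounds up so DataCopy always moves full blocks, while copy_out copies the aligned part and handles the tail separately (zero-padded atomic add on 310P, DataCopyPad otherwise). The sketch below is plain host C++ with hypothetical lengths and only works through that rounding arithmetic.

```cpp
// Host-side illustration of the 32-byte block rounding used by copy_in/copy_out
// in the get_row kernels above. Not part of the gem; the lengths are hypothetical.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t elem_per_block = 32 / sizeof(float);  // 8 floats per 32-byte block
    const size_t lens[] = {5, 8, 13, 32};              // hypothetical row lengths

    for (size_t len : lens) {
        const size_t tail    = len % elem_per_block;          // elements past the last full block
        const size_t aligned = len & ~(elem_per_block - 1);   // round down to whole blocks

        // copy_in rounds the length *up* so DataCopy moves whole blocks only;
        // copy_out copies `aligned` elements and then pads or DataCopyPads the tail.
        const size_t rounded_up = (tail != 0) ? aligned + elem_per_block : aligned;
        printf("len=%zu -> aligned=%zu, tail=%zu, copy_in length=%zu\n",
               len, aligned, tail, rounded_up);
    }
    return 0;
}
```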