whispercpp 1.2.0.2 → 1.3.1

Files changed (135)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@@ -0,0 +1,204 @@
+ #include "kernel_operator.h"
+
+ // optimize me: use a template to avoid duplicating code.
+ using namespace AscendC;
+ #ifdef ASCEND_310P // 310P does not support 4-bit get_row
+ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+     GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+     GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+     GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+     // So that subsequent test cases can keep running, just print an error message here; the test case that calls this operator will, of course, fail.
+     printf("Ascend310P not support 4bit get row.\n");
+ }
+ #else
+
+ #define BUFFER_NUM 2
+
+ #define QK4_0 32
+
+ class GET_ROW_Q4_0 {
+    public:
+     __aicore__ inline GET_ROW_Q4_0() {}
+     __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
+                                 int64_t *input_ne_ub, int64_t *indices_ne_ub,
+                                 size_t *indices_nb_ub, int64_t *output_ne_ub,
+                                 size_t *output_nb_ub) {
+         int64_t op_block_num = GetBlockNum();
+         int64_t op_block_idx = GetBlockIdx();
+
+         for (int i = 0; i < 4; i++) {
+             input_ne[i] = input_ne_ub[i];
+             indices_ne[i] = indices_ne_ub[i];
+             indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
+             scale_ne[i] = input_ne_ub[i];
+             output_ne[i] = output_ne_ub[i];
+             output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
+         }
+
+         // one scale per group.
+         scale_ne[0] /= QK4_0;
+
+         input_stride[0] = 1;
+         scale_stride[0] = 1;
+         output_stride[0] = 1;
+         for (int i = 1; i < 4; i++) {
+             input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
+             scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+         }
+
+         group_size_in_row = input_ne[0] / QK4_0;
+         int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
+                                input_ne[3] / 2;
+
+         // Indices has two dims. n_elements = total number of rows to gather;
+         // dr = number of rows this block should handle.
+         uint64_t n_elements =
+             indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
+         dr = n_elements / op_block_num;
+
+         uint64_t tails = n_elements % op_block_num;
+         if (op_block_idx < tails) {
+             dr += 1;
+             ir = dr * op_block_idx;
+         } else {
+             ir = dr * op_block_idx + tails;
+         }
+
+         input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
+         scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
+         indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
+         output_gm.SetGlobalBuffer((__gm__ float *)output);
+
+         pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
+         pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
+         pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
+     }
+
+     __aicore__ inline void copy_in(uint32_t offset) {
+         LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
+         // 32 * sizeof(int4b_t) = 16, which is not aligned to 32; why is there no error?
+         DataCopy(input_local, input_gm[offset], QK4_0);
+         input_queue.EnQue(input_local);
+     }
+
+     __aicore__ inline void copy_out(uint32_t offset) {
+         LocalTensor<float> output_local = output_queue.DeQue<float>();
+         DataCopy(output_gm[offset], output_local, QK4_0);
+         output_queue.FreeTensor(output_local);
+     }
+
+     __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
+         const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
+         const int64_t indices_ne1_idx =
+             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
+             indices_ne[0];
+         const int64_t indices_ne0_idx =
+             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
+              indices_ne1_idx * indices_ne[0]);
+
+         const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
+                                        indices_ne1_idx * indices_stride[1] +
+                                        indices_ne2_idx * indices_stride[2];
+         const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
+
+         const int64_t input_offset = selected_row_idx * input_stride[1] +
+                                      indices_ne1_idx * input_stride[2] +
+                                      indices_ne2_idx * input_stride[3] +
+                                      group * QK4_0;
+         const int64_t scale_offset = selected_row_idx * scale_stride[1] +
+                                      indices_ne1_idx * scale_stride[2] +
+                                      indices_ne2_idx * scale_stride[3] + group;
+         const int64_t output_offset = indices_ne0_idx * output_stride[1] +
+                                       indices_ne1_idx * output_stride[2] +
+                                       indices_ne2_idx * output_stride[3] +
+                                       group * QK4_0;
+
+         copy_in(input_offset);
+         LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
+         LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
+         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
+
+         // TODO: cast more data to speed up.
+         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
+         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
+
+         // Only the multiply needs to be computed per group.
+         half scale = scale_gm.GetValue(scale_offset);
+
+         Muls(output_local, output_local, (float)scale, QK4_0);
+
+         input_queue.FreeTensor(input_local);
+         cast_queue.FreeTensor(cast_local);
+         output_queue.EnQue(output_local);
+
+         copy_out(output_offset);
+     }
+
+     __aicore__ inline void calculate() {
+         for (int64_t i = ir; i < ir + dr; i++) {
+             for (int64_t j = 0; j < group_size_in_row; j++) {
+                 calculate_group(i, j);
+             }
+         }
+     }
+
+    private:
+     int64_t input_ne[4];
+     size_t input_stride[4];
+
+     int64_t scale_ne[4];
+     size_t scale_stride[4];
+
+     int64_t indices_ne[4];
+     size_t indices_stride[4];
+
+     int64_t output_ne[4];
+     size_t output_stride[4];
+
+     int64_t ir;
+     int64_t dr;
+
+     int64_t group_size_in_row;
+
+     TPipe pipe;
+     GlobalTensor<int4b_t> input_gm;
+     GlobalTensor<half> scale_gm;
+     GlobalTensor<int32_t> indices_gm;
+     GlobalTensor<float> output_gm;
+     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+     TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
+ };
+
+ template <typename T>
+ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+     auto gm_ptr = (__gm__ uint8_t *)gm;
+     auto ub_ptr = (uint8_t *)(ub);
+     for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+         *ub_ptr = *gm_ptr;
+     }
+ }
+
+ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+     GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+     GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+     GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+     int64_t input_ne_ub[4];
+     int64_t indices_ne_ub[4];
+     size_t indices_nb_ub[4];
+     int64_t output_ne_ub[4];
+     size_t output_nb_ub[4];
+
+     copy_to_ub(input_ne_gm, input_ne_ub, 32);
+     copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
+     copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
+     copy_to_ub(output_ne_gm, output_ne_ub, 32);
+     copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+     GET_ROW_Q4_0 op;
+     op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
+             indices_nb_ub, output_ne_ub, output_nb_ub);
+     op.calculate();
+ }
+
+ #endif // #ifdef ASCEND_310P
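
For readers not familiar with the Q4_0 layout this kernel consumes, the sketch below restates what one calculate_group call computes, as plain host-side C++. It is an illustrative reference, not part of the gem: the packed-nibble order (low nibble first) and the use of float for the half-precision scale are assumptions made for the sketch; the kernel itself casts int4b_t to half to float and multiplies by the per-group scale stored after the quantized data.

#include <cstdint>

constexpr int QK4_0 = 32;   // elements per quantization group, as in the kernel

// Dequantize one 32-element Q4_0 group: out[i] = scale * q[i],
// where q[i] is a signed 4-bit value unpacked from 16 packed bytes.
void dequantize_q4_0_group(const uint8_t *quants, float scale, float *out) {
    for (int i = 0; i < QK4_0; ++i) {
        const uint8_t byte = quants[i / 2];
        int8_t q = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);  // nibble order assumed
        if (q >= 8) q -= 16;          // sign-extend to match int4b_t semantics
        out[i] = scale * (float)q;    // the Cast + Muls steps in the kernel above
    }
}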
data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
@@ -0,0 +1,191 @@
+ #include "kernel_operator.h"
+
+ // optimize me: use a template to avoid duplicating code.
+ using namespace AscendC;
+
+ #define BUFFER_NUM 2
+
+ #define QK8_0 32
+
+ class GET_ROW_Q8_0 {
+    public:
+     __aicore__ inline GET_ROW_Q8_0() {}
+     __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
+                                 int64_t *input_ne_ub, int64_t *indices_ne_ub,
+                                 size_t *indices_nb_ub, int64_t *output_ne_ub,
+                                 size_t *output_nb_ub) {
+         int64_t op_block_num = GetBlockNum();
+         int64_t op_block_idx = GetBlockIdx();
+
+         for (int i = 0; i < 4; i++) {
+             input_ne[i] = input_ne_ub[i];
+             indices_ne[i] = indices_ne_ub[i];
+             indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
+             scale_ne[i] = input_ne_ub[i];
+             output_ne[i] = output_ne_ub[i];
+             output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
+         }
+
+         // one scale per group.
+         scale_ne[0] /= QK8_0;
+
+         input_stride[0] = 1;
+         scale_stride[0] = 1;
+         output_stride[0] = 1;
+         for (int i = 1; i < 4; i++) {
+             input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
+             scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+         }
+
+         group_size_in_row = input_ne[0] / QK8_0;
+         int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
+                                input_ne[3] * sizeof(int8_t);
+
+         // Indices has two dims. n_elements = total number of rows to gather;
+         // dr = number of rows this block should handle.
+         uint64_t n_elements =
+             indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
+         dr = n_elements / op_block_num;
+
+         uint64_t tails = n_elements % op_block_num;
+         if (op_block_idx < tails) {
+             dr += 1;
+             ir = dr * op_block_idx;
+         } else {
+             ir = dr * op_block_idx + tails;
+         }
+
+         input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
+         scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
+         indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
+         output_gm.SetGlobalBuffer((__gm__ float *)output);
+
+         pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
+         pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
+         pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
+     }
+
+     __aicore__ inline void copy_in(uint32_t offset) {
+         LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
+         DataCopy(input_local, input_gm[offset], QK8_0);
+         input_queue.EnQue(input_local);
+     }
+
+     __aicore__ inline void copy_out(uint32_t offset) {
+         LocalTensor<float> output_local = output_queue.DeQue<float>();
+         DataCopy(output_gm[offset], output_local, QK8_0);
+         output_queue.FreeTensor(output_local);
+     }
+
+     __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
+         const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
+         const int64_t indices_ne1_idx =
+             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
+             indices_ne[0];
+         const int64_t indices_ne0_idx =
+             (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
+              indices_ne1_idx * indices_ne[0]);
+
+         const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
+                                        indices_ne1_idx * indices_stride[1] +
+                                        indices_ne2_idx * indices_stride[2];
+         const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
+
+         const int64_t input_offset = selected_row_idx * input_stride[1] +
+                                      indices_ne1_idx * input_stride[2] +
+                                      indices_ne2_idx * input_stride[3] +
+                                      group * QK8_0;
+         const int64_t scale_offset = selected_row_idx * scale_stride[1] +
+                                      indices_ne1_idx * scale_stride[2] +
+                                      indices_ne2_idx * scale_stride[3] + group;
+         const int64_t output_offset = indices_ne0_idx * output_stride[1] +
+                                       indices_ne1_idx * output_stride[2] +
+                                       indices_ne2_idx * output_stride[3] +
+                                       group * QK8_0;
+
+         copy_in(input_offset);
+         LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
+         LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
+         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
+
+         // TODO: cast more data to speed up.
+         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
+         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
+
+         // Only the multiply needs to be computed per group.
+         half scale = scale_gm.GetValue(scale_offset);
+         Muls(output_local, output_local, (float)scale, QK8_0);
+
+         input_queue.FreeTensor(input_local);
+         cast_queue.FreeTensor(cast_local);
+         output_queue.EnQue(output_local);
+
+         copy_out(output_offset);
+     }
+
+     __aicore__ inline void calculate() {
+         for (int64_t i = ir; i < ir + dr; i++) {
+             for (int64_t j = 0; j < group_size_in_row; j++) {
+                 calculate_group(i, j);
+             }
+         }
+     }
+
+    private:
+     int64_t input_ne[4];
+     size_t input_stride[4];
+
+     int64_t scale_ne[4];
+     size_t scale_stride[4];
+
+     int64_t indices_ne[4];
+     size_t indices_stride[4];
+
+     int64_t output_ne[4];
+     size_t output_stride[4];
+
+     int64_t ir;
+     int64_t dr;
+
+     int64_t group_size_in_row;
+
+     TPipe pipe;
+     GlobalTensor<int8_t> input_gm;
+     GlobalTensor<half> scale_gm;
+     GlobalTensor<int32_t> indices_gm;
+     GlobalTensor<float> output_gm;
+     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+     TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
+ };
+
+ template <typename T>
+ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+     auto gm_ptr = (__gm__ uint8_t *)gm;
+     auto ub_ptr = (uint8_t *)(ub);
+     for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+         *ub_ptr = *gm_ptr;
+     }
+ }
+
+ extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
+     GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+     GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+     GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+     int64_t input_ne_ub[4];
+     int64_t indices_ne_ub[4];
+     size_t indices_nb_ub[4];
+     int64_t output_ne_ub[4];
+     size_t output_nb_ub[4];
+
+     copy_to_ub(input_ne_gm, input_ne_ub, 32);
+     copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
+     copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
+     copy_to_ub(output_ne_gm, output_ne_ub, 32);
+     copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+     GET_ROW_Q8_0 op;
+     op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
+             indices_nb_ub, output_ne_ub, output_nb_ub);
+     op.calculate();
+ }
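
Both get_row kernels above (and the quantize kernel below) divide their work across AI cores with the same remainder-balancing arithmetic in init(): roughly n / num_cores rows each, with the first n % num_cores cores taking one extra row. A minimal stand-alone restatement, with a worked example in the comment (the function name is mine, not from the source):

#include <cstdint>
#include <utility>

// Returns {ir, dr}: the first row index and the row count handled by core
// `idx` out of `num` cores, mirroring the dr/ir computation in init().
std::pair<int64_t, int64_t> split_rows(int64_t n, int64_t num, int64_t idx) {
    int64_t dr    = n / num;
    int64_t tails = n % num;   // the first `tails` cores take one extra row
    int64_t ir;
    if (idx < tails) {
        dr += 1;
        ir = dr * idx;
    } else {
        ir = dr * idx + tails;
    }
    return {ir, dr};           // e.g. n=10, num=3 -> ranges [0,4), [4,7), [7,10)
}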
data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
@@ -0,0 +1,218 @@
+ #include "kernel_operator.h"
+
+ using namespace AscendC;
+ #ifdef ASCEND_310P
+ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
+     GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+     GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+     // So that subsequent test cases can keep running, just print an error message here; the test case that calls this operator will, of course, fail.
+     printf("Ascend310P not support f16->8bit quantization.\n");
+ }
+ #else
+
+ #define BUFFER_NUM 2
+ #define QK8_0 32
+
+ class QUANTIZE_F16_Q8_0 {
+    public:
+     __aicore__ inline QUANTIZE_F16_Q8_0() {}
+     __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
+                                 int64_t *input_ne_ub, size_t *input_nb_ub,
+                                 int64_t *output_ne_ub) {
+         int64_t op_block_num = GetBlockNum();
+         int64_t op_block_idx = GetBlockIdx();
+
+         for (int i = 0; i < 4; i++) {
+             input_ne[i] = input_ne_ub[i];
+             input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+
+             output_ne[i] = output_ne_ub[i];
+         }
+
+         output_stride[0] = 1;
+         for (int i = 1; i < 4; i++) {
+             output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
+         }
+
+         scale_ne = input_ne;
+         scale_stride[0] = 1;
+         scale_stride[1] = input_ne[0] / QK8_0;
+         for (int i = 2; i < 4; i++) {
+             scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+         }
+
+         // split input tensor by rows.
+         uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
+         dr = nr / op_block_num;
+
+         uint64_t tails = nr % op_block_num;
+         if (op_block_idx < tails) {
+             dr += 1;
+             ir = dr * op_block_idx;
+         } else {
+             ir = dr * op_block_idx + tails;
+         }
+
+         group_size_in_row = scale_stride[1];
+         int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
+                               output_ne[3] * sizeof(uint8_t);
+
+         input_gm.SetGlobalBuffer((__gm__ half *)input);
+         output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
+         scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
+                                                  group_size_in_row *
+                                                  sizeof(half)));
+
+         pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
+         pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
+         pipe.InitBuffer(work_queue, 1, 32);
+         pipe.InitBuffer(max_queue, 1, 32);
+         pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
+         pipe.InitBuffer(scale_queue, 1, 32);
+         pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(float));
+     }
+
+     __aicore__ inline void copy_in(uint32_t offset) {
+         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
+         DataCopy(input_local, input_gm[offset], QK8_0);
+         input_queue.EnQue(input_local);
+     }
+
+     __aicore__ inline void copy_out(uint32_t offset) {
+         LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
+         DataCopy(output_gm[offset], output_local, QK8_0);
+         output_queue.FreeTensor(output_local);
+     }
+
+     __aicore__ inline half calculate_group(int64_t row, int64_t group) {
+         const int64_t i3 = row / (input_ne[1] * input_ne[2]);
+         const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
+         const int64_t i1 =
+             row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
+
+         const int64_t input_offset = i1 * input_stride[1] +
+                                      i2 * input_stride[2] +
+                                      i3 * input_stride[3] + QK8_0 * group;
+
+         const int64_t output_offset = i1 * output_stride[1] +
+                                       i2 * output_stride[2] +
+                                       i3 * output_stride[3] + QK8_0 * group;
+
+         copy_in(input_offset);
+         LocalTensor<half> input_local = input_queue.DeQue<half>();
+         LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
+         LocalTensor<float> work_local = work_queue.AllocTensor<float>();
+         LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
+         LocalTensor<float> max_local = max_queue.AllocTensor<float>();
+         LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
+
+         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
+         Abs(abs_local, cast_local, QK8_0);
+         ReduceMax(max_local, abs_local, work_local, QK8_0);
+
+         pipe_barrier(PIPE_ALL);
+         float d = max_local.GetValue(0);
+         d = d / ((1 << 7) - 1);
+         if (d != 0) {
+             Muls(cast_local, cast_local, 1.0f / d, QK8_0);
+         }
+
+         Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
+         Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
+         Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
+         output_queue.EnQue(output_local);
+         copy_out(output_offset);
+
+         input_queue.FreeTensor(input_local);
+         work_queue.FreeTensor(work_local);
+         abs_queue.FreeTensor(abs_local);
+         max_queue.FreeTensor(max_local);
+         cast_queue.FreeTensor(cast_local);
+         return (half)d;
+     }
+
+     __aicore__ inline void calculate() {
+         LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
+         uint32_t scale_local_offset = 0;
+         uint32_t scale_global_offset = 0;
+         for (int64_t i = ir; i < ir + dr; i++) {
+             for (int64_t j = 0; j < group_size_in_row; j++) {
+                 half scale = calculate_group(i, j);
+                 scale_local.SetValue(scale_local_offset++, scale);
+                 if (scale_local_offset == 16) {
+                     scale_local_offset = 0;
+                     // TODO: OPTIMIZE ME
+                     pipe_barrier(PIPE_ALL);
+                     DataCopy(scale_gm[scale_global_offset], scale_local, 16);
+                     pipe_barrier(PIPE_ALL);
+                     scale_global_offset += 16;
+                 }
+             }
+         }
+
+         if (scale_local_offset != 0) {
+             pipe_barrier(PIPE_ALL);
+             DataCopyExtParams dataCopyParams;
+             dataCopyParams.blockCount = 1;
+             dataCopyParams.blockLen = scale_local_offset * sizeof(half);
+             DataCopyPad(scale_gm[scale_global_offset], scale_local,
+                         dataCopyParams);
+             pipe_barrier(PIPE_ALL);
+         }
+     }
+
+    private:
+     int64_t input_ne[4];
+     size_t input_stride[4];
+
+     int64_t *scale_ne;
+     size_t scale_stride[4];
+
+     int64_t output_ne[4];
+     size_t output_stride[4];
+
+     int64_t group_size_in_row;
+
+     int64_t ir;
+     int64_t dr;
+
+     TPipe pipe;
+     GlobalTensor<half> input_gm;
+     GlobalTensor<half> scale_gm;
+     GlobalTensor<int8_t> output_gm;
+     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+     TQue<QuePosition::VECIN, 1> work_queue;
+     TQue<QuePosition::VECOUT, 1> max_queue;
+     TQue<QuePosition::VECIN, 1> abs_queue;
+     TQue<QuePosition::VECOUT, 1> scale_queue;
+     TQue<QuePosition::VECOUT, 1> cast_queue;
+
+ };
+
+ template <typename T>
+ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+     auto gm_ptr = (__gm__ uint8_t *)gm;
+     auto ub_ptr = (uint8_t *)(ub);
+     for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+         *ub_ptr = *gm_ptr;
+     }
+ }
+
+ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
+     GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+     GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+     int64_t input_ne_ub[4];
+     size_t input_nb_ub[4];
+     int64_t output_ne_ub[4];
+
+     copy_to_ub(input_ne_gm, input_ne_ub, 32);
+     copy_to_ub(input_nb_gm, input_nb_ub, 32);
+     copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+     QUANTIZE_F16_Q8_0 op;
+     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+     op.calculate();
+ }
+
+ #endif // #ifdef ASCEND_310P
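
The per-group math in calculate_group above is the usual Q8_0 scheme: take the group's absolute maximum, derive a scale d = max|x| / 127, round each value divided by d to a signed 8-bit integer, and store the half-precision scales after the int8 data. A hedged host-side restatement of that one step (float input instead of the kernel's f16; the helper name is mine, not from the source):

#include <cstdint>
#include <cmath>
#include <algorithm>

constexpr int QK8_0 = 32;   // elements per group, as in the kernel

// Quantize one 32-element group; returns the scale the caller stores per group.
float quantize_q8_0_group(const float *x, int8_t *q) {
    float amax = 0.0f;
    for (int i = 0; i < QK8_0; ++i) amax = std::max(amax, std::fabs(x[i]));
    const float d  = amax / ((1 << 7) - 1);        // same 127 divisor as the kernel
    const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
    for (int i = 0; i < QK8_0; ++i) q[i] = (int8_t)std::lround(x[i] * id);
    return d;
}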