whispercpp 1.2.0.2 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (135) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
@@ -0,0 +1,216 @@
1
+ #include "kernel_operator.h"
2
+
3
+ using namespace AscendC;
4
+ #ifdef ASCEND_310P // 310P not support f32->8bit quantization
5
+ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
6
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
+ // Stub compiled when ASCEND_310P is defined: it ignores every argument and only prints an error so that the following test cases can keep running. Of course, any test case that calls this operator still fails.
9
+ printf("Ascend310P not support f32->8bit quantization.\n");
10
+ }
11
+ #else
12
+
13
+ #define BUFFER_NUM 2
14
+ #define QK8_0 32
15
+
16
+ class QUANTIZE_F32_Q8_0 {
17
+ public:
18
+ __aicore__ inline QUANTIZE_F32_Q8_0() {}
19
+ __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
20
+ int64_t *input_ne_ub, size_t *input_nb_ub,
21
+ int64_t *output_ne_ub) {
22
+ int64_t op_block_num = GetBlockNum();
23
+ int64_t op_block_idx = GetBlockIdx();
24
+
25
+ for (int i = 0; i < 4; i++) {
26
+ input_ne[i] = input_ne_ub[i];
27
+ input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
28
+
29
+ output_ne[i] = output_ne_ub[i];
30
+ }
31
+
32
+ output_stride[0] = 1;
33
+ for (int i = 1; i < 4; i++) {
34
+ output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
35
+ }
36
+
37
+ scale_ne = input_ne;
38
+ scale_stride[0] = 1;
39
+ scale_stride[1] = input_ne[0] / QK8_0;
40
+ for (int i = 2; i < 4; i++) {
41
+ scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
42
+ }
43
+
44
+ // split input tensor by rows.
45
+ uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
46
+ dr = nr / op_block_num;
47
+
48
+ uint64_t tails = nr % op_block_num;
49
+ if (op_block_idx < tails) {
50
+ dr += 1;
51
+ ir = dr * op_block_idx;
52
+ } else {
53
+ ir = dr * op_block_idx + tails;
54
+ }
55
+
56
+ group_size_in_row = scale_stride[1];
57
+ int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
58
+ output_ne[3] * sizeof(uint8_t);
59
+
60
+ input_gm.SetGlobalBuffer((__gm__ float *)input);
61
+ output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
62
+ scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
63
+ ir * group_size_in_row *
64
+ sizeof(half)));
65
+
66
+ pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
67
+ pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
68
+ pipe.InitBuffer(work_queue, 1, 32);
69
+ pipe.InitBuffer(max_queue, 1, 32);
70
+ pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
71
+ pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
72
+ pipe.InitBuffer(scale_queue, 1, 32);
73
+ }
74
+
75
+ __aicore__ inline void copy_in(uint32_t offset) {
76
+ LocalTensor<float> input_local = input_queue.AllocTensor<float>();
77
+ DataCopy(input_local, input_gm[offset], QK8_0);
78
+ input_queue.EnQue(input_local);
79
+ }
80
+
81
+ __aicore__ inline void copy_out(uint32_t offset) {
82
+ LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
83
+ DataCopy(output_gm[offset], output_local, QK8_0);
84
+ output_queue.FreeTensor(output_local);
85
+ }
86
+
87
+ __aicore__ inline half calculate_group(int64_t row, int64_t group) {
88
+ const int64_t i3 = row / (input_ne[1] * input_ne[2]);
89
+ const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
90
+ const int64_t i1 =
91
+ row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
92
+
93
+ const int64_t input_offset = i1 * input_stride[1] +
94
+ i2 * input_stride[2] +
95
+ i3 * input_stride[3] + QK8_0 * group;
96
+
97
+ const int64_t output_offset = i1 * output_stride[1] +
98
+ i2 * output_stride[2] +
99
+ i3 * output_stride[3] + QK8_0 * group;
100
+
101
+ copy_in(input_offset);
102
+ LocalTensor<float> input_local = input_queue.DeQue<float>();
103
+ LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
104
+ LocalTensor<float> work_local = work_queue.AllocTensor<float>();
105
+ LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
106
+ LocalTensor<float> max_local = max_queue.AllocTensor<float>();
107
+ LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
108
+
109
+ Abs(abs_local, input_local, QK8_0);
110
+ ReduceMax(max_local, abs_local, work_local, QK8_0);
111
+ pipe_barrier(PIPE_ALL);
112
+ float d = max_local.GetValue(0);
113
+ d = d / ((1 << 7) - 1);
114
+ if (d != 0) {
115
+ Muls(input_local, input_local, 1.0f / d, QK8_0);
116
+ }
117
+
118
+ Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
119
+ Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
120
+ Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
121
+ output_queue.EnQue(output_local);
122
+ copy_out(output_offset);
123
+
124
+ input_queue.FreeTensor(input_local);
125
+ work_queue.FreeTensor(work_local);
126
+ abs_queue.FreeTensor(abs_local);
127
+ max_queue.FreeTensor(max_local);
128
+ cast_queue.FreeTensor(cast_local);
129
+
130
+ return (half)d;
131
+ }
132
+
133
+ __aicore__ inline void calculate() {
134
+ LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
135
+ uint32_t scale_local_offset = 0;
136
+ uint32_t scale_global_offset = 0;
137
+ for (int64_t i = ir; i < ir + dr; i++) {
138
+ for (int64_t j = 0; j < group_size_in_row; j++) {
139
+ half scale = calculate_group(i, j);
140
+ scale_local.SetValue(scale_local_offset++, scale);
141
+ if (scale_local_offset == 16) {
142
+ scale_local_offset = 0;
143
+ // TODO: OPTIMIZE ME
144
+ pipe_barrier(PIPE_ALL);
145
+ DataCopy(scale_gm[scale_global_offset], scale_local, 16);
146
+ pipe_barrier(PIPE_ALL);
147
+ scale_global_offset += 16;
148
+ }
149
+ }
150
+ }
151
+
152
+ if (scale_local_offset != 0) {
153
+ pipe_barrier(PIPE_ALL);
154
+ DataCopyExtParams dataCopyParams;
155
+ dataCopyParams.blockCount = 1;
156
+ dataCopyParams.blockLen = scale_local_offset * sizeof(half);
157
+ DataCopyPad(scale_gm[scale_global_offset], scale_local,
158
+ dataCopyParams);
159
+ pipe_barrier(PIPE_ALL);
160
+ }
161
+ }
162
+
163
+ private:
164
+ int64_t input_ne[4];
165
+ size_t input_stride[4];
166
+
167
+ int64_t *scale_ne;
168
+ size_t scale_stride[4];
169
+
170
+ int64_t output_ne[4];
171
+ size_t output_stride[4];
172
+
173
+ int64_t group_size_in_row;
174
+
175
+ int64_t ir;
176
+ int64_t dr;
177
+
178
+ TPipe pipe;
179
+ GlobalTensor<float> input_gm;
180
+ GlobalTensor<half> scale_gm;
181
+ GlobalTensor<int8_t> output_gm;
182
+ TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
183
+ TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
184
+ TQue<QuePosition::VECIN, 1> work_queue;
185
+ TQue<QuePosition::VECOUT, 1> max_queue;
186
+ TQue<QuePosition::VECIN, 1> abs_queue;
187
+ TQue<QuePosition::VECIN, 1> cast_queue;
188
+ TQue<QuePosition::VECOUT, 1> scale_queue;
189
+ };
190
+
191
+ template <typename T>
192
+ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
193
+ auto gm_ptr = (__gm__ uint8_t *)gm;
194
+ auto ub_ptr = (uint8_t *)(ub);
195
+ for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
196
+ *ub_ptr = *gm_ptr;
197
+ }
198
+ }
199
+
200
+ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
201
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
202
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
203
+ int64_t input_ne_ub[4];
204
+ size_t input_nb_ub[4];
205
+ int64_t output_ne_ub[4];
206
+
207
+ copy_to_ub(input_ne_gm, input_ne_ub, 32);
208
+ copy_to_ub(input_nb_gm, input_nb_ub, 32);
209
+ copy_to_ub(output_ne_gm, output_ne_ub, 32);
210
+
211
+ QUANTIZE_F32_Q8_0 op;
212
+ op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
213
+ op.calculate();
214
+ }
215
+
216
+ #endif // #ifdef ASCEND_310P
@@ -0,0 +1,295 @@
1
+ #include "kernel_operator.h"
2
+
3
+ using namespace AscendC;
4
+ #ifdef ASCEND_310P // 310P not support float->4bit quantization
5
+ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
6
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
+ // Stub compiled when ASCEND_310P is defined: it ignores every argument and only prints an error so that the following test cases can keep running. Of course, any test case that calls this operator still fails.
9
+ printf("Ascend310P not support f32->4bit quantization.\n");
10
+ }
11
+
12
+ extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
13
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
14
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
15
+ // Stub compiled when ASCEND_310P is defined: it ignores every argument and only prints an error so that the following test cases can keep running. Of course, any test case that calls this operator still fails.
16
+ printf("Ascend310P not support f16->4bit quantization.\n");
17
+ }
18
+ #else
19
+
20
+ #define BUFFER_NUM 2
21
+ #define Group_Size 32
22
+
23
+ template <typename SRC_T>
24
+ class QUANTIZE_FLOAT_TO_Q4_0 {
25
+ public:
26
+ __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
27
+ __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
28
+ int64_t *input_ne_ub, size_t *input_nb_ub,
29
+ int64_t *output_ne_ub) {
30
+ // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
31
+ // permute=[0,0,0,0]):
32
+ // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
33
+ int64_t op_block_num = GetBlockNum();
34
+ int64_t op_block_idx = GetBlockIdx();
35
+
36
+ // input stride of data elements
37
+ for (int i = 0; i < 4; i++) {
38
+ input_ne[i] = input_ne_ub[i];
39
+ input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
40
+ output_ne[i] = output_ne_ub[i];
41
+ }
42
+
43
+ // output stride of data elements
44
+ output_stride[0] = 1;
45
+ for (int i = 1; i < 4; i++) {
46
+ output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
47
+ }
48
+
49
+ // scale saved one by one after data:. [group1_scale, group2_scale, ...]
50
+ scale_ne = input_ne;
51
+ scale_stride[0] = 1;
52
+ scale_stride[1] = input_ne[0] / Group_Size;
53
+ for (int i = 2; i < 4; i++) {
54
+ scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
55
+ }
56
+
57
+ // split input tensor by rows.
58
+ uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
59
+ dr = nr / op_block_num;
60
+
61
+ uint64_t tails = nr % op_block_num;
62
+ if (op_block_idx < tails) {
63
+ dr += 1;
64
+ ir = dr * op_block_idx;
65
+ } else {
66
+ ir = dr * op_block_idx + tails;
67
+ }
68
+
69
+ group_size_in_row = scale_stride[1];
70
+ int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
71
+ output_ne[3] * sizeof(uint8_t) / 2;
72
+
73
+ input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
74
+ output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
75
+ scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
76
+ group_size_in_row *
77
+ sizeof(half)));
78
+
79
+ pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
80
+ pipe.InitBuffer(output_queue, BUFFER_NUM,
81
+ Group_Size * sizeof(int8_t) / 2);
82
+ pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
83
+ pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
84
+ pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
85
+ pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
86
+ pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
87
+ pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
88
+ pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
89
+ }
90
+
91
+ __aicore__ inline void copy_in(uint32_t offset) {
92
+ LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
93
+ DataCopy(input_local, input_gm[offset], Group_Size);
94
+ input_queue.EnQue(input_local);
95
+ }
96
+
97
+ __aicore__ inline void copy_out(uint32_t offset) {
98
+ // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
99
+ // and using DataCopyPad to avoid 32 bits align.
100
+ LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
101
+ LocalTensor<int8_t> output_int8_local =
102
+ output_local.ReinterpretCast<int8_t>();
103
+
104
+ DataCopyExtParams dataCopyParams;
105
+ dataCopyParams.blockCount = 1;
106
+ dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
107
+ DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
108
+
109
+ output_queue.FreeTensor(output_local);
110
+ }
111
+
112
+ __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
113
+ LocalTensor<float> input_local) {
114
+ DataCopy(cast_local, input_local, Group_Size);
115
+ }
116
+
117
+ __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
118
+ LocalTensor<half> input_local) {
119
+ Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
120
+ }
121
+
122
+ __aicore__ inline half calculate_group(int64_t row, int64_t group) {
123
+ const int64_t i3 = row / (input_ne[1] * input_ne[2]);
124
+ const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
125
+ const int64_t i1 =
126
+ row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
127
+
128
+ const int64_t input_offset = i1 * input_stride[1] +
129
+ i2 * input_stride[2] +
130
+ i3 * input_stride[3] + Group_Size * group;
131
+
132
+ // output_offset is stride for output_gm which datatype is int8_t and
133
+ // divided by 2 is needed for int4b_t.
134
+ const int64_t output_offset = (i1 * output_stride[1] +
135
+ i2 * output_stride[2] +
136
+ i3 * output_stride[3] +
137
+ Group_Size * group) / 2;
138
+ copy_in(input_offset);
139
+
140
+ LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
141
+ LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
142
+ LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
143
+ LocalTensor<float> work_local = work_queue.AllocTensor<float>();
144
+ LocalTensor<float> max_local = max_queue.AllocTensor<float>();
145
+ LocalTensor<float> min_local = min_queue.AllocTensor<float>();
146
+ LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
147
+ LocalTensor<half> half_local = half_queue.AllocTensor<half>();
148
+
149
+ input_to_cast(cast_local, input_local);
150
+
151
+ ReduceMax(max_local, cast_local, work_local, Group_Size);
152
+ ReduceMin(min_local, cast_local, work_local, Group_Size);
153
+ const float max_value = max_local.GetValue(0);
154
+ const float min_value = min_local.GetValue(0);
155
+ float d = max_value;
156
+ if (min_value < 0 && (-1 * min_value) > max_value) {
157
+ d = min_value;
158
+ }
159
+
160
+ d = d / (-8);
161
+ if (d != 0) {
162
+ Muls(cast_local, cast_local, 1.0f / d, Group_Size);
163
+ }
164
+
165
+ // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
166
+ float scalar = 8.5f;
167
+ Adds(cast_local, cast_local, scalar, Group_Size);
168
+ Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
169
+ scalar = 15.0f;
170
+ Mins(cast_local, cast_local, scalar, Group_Size);
171
+ scalar = -8.0f;
172
+ Adds(cast_local, cast_local, scalar, Group_Size);
173
+
174
+ // float->half->int4b
175
+ Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
176
+ Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
177
+
178
+ output_queue.EnQue(output_local);
179
+ copy_out(output_offset);
180
+
181
+ input_queue.FreeTensor(input_local);
182
+ work_queue.FreeTensor(work_local);
183
+ max_queue.FreeTensor(max_local);
184
+ min_queue.FreeTensor(min_local);
185
+ int8_queue.FreeTensor(int8_local);
186
+ half_queue.FreeTensor(half_local);
187
+ cast_queue.FreeTensor(cast_local);
188
+ return (half)d;
189
+ }
190
+
191
+ __aicore__ inline void calculate() {
192
+ LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
193
+ uint32_t scale_local_offset = 0;
194
+ uint32_t scale_global_offset = 0;
195
+ for (int64_t i = ir; i < ir + dr; i++) {
196
+ for (int64_t j = 0; j < group_size_in_row; j++) {
197
+ half scale = calculate_group(i, j);
198
+ scale_local.SetValue(scale_local_offset++, scale);
199
+ // Copy Group_Size/2 length data each time.
200
+ if (scale_local_offset == Group_Size / 2) {
201
+ scale_local_offset = 0;
202
+ // TODO: OPTIMIZE ME
203
+ pipe_barrier(PIPE_ALL);
204
+ DataCopy(scale_gm[scale_global_offset], scale_local,
205
+ Group_Size / 2);
206
+ pipe_barrier(PIPE_ALL);
207
+ scale_global_offset += Group_Size / 2;
208
+ }
209
+ }
210
+ }
211
+
212
+ if (scale_local_offset != 0) {
213
+ pipe_barrier(PIPE_ALL);
214
+ DataCopyExtParams dataCopyParams;
215
+ dataCopyParams.blockCount = 1;
216
+ dataCopyParams.blockLen = scale_local_offset * sizeof(half);
217
+ DataCopyPad(scale_gm[scale_global_offset], scale_local,
218
+ dataCopyParams);
219
+ pipe_barrier(PIPE_ALL);
220
+ }
221
+ scale_queue.FreeTensor(scale_local);
222
+ }
223
+
224
+ private:
225
+ int64_t input_ne[4];
226
+ size_t input_stride[4];
227
+
228
+ int64_t *scale_ne;
229
+ size_t scale_stride[4];
230
+
231
+ int64_t output_ne[4];
232
+ size_t output_stride[4];
233
+
234
+ int64_t group_size_in_row;
235
+
236
+ int64_t ir;
237
+ int64_t dr;
238
+
239
+ TPipe pipe;
240
+ GlobalTensor<SRC_T> input_gm;
241
+ GlobalTensor<half> scale_gm;
242
+ GlobalTensor<int8_t> output_gm;
243
+ TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
244
+ TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
245
+ TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
246
+ TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
247
+ TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
248
+ TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
249
+ TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
250
+ TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
251
+ TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
252
+ };
253
+
254
+ template <typename T>
255
+ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
256
+ auto gm_ptr = (__gm__ uint8_t *)gm;
257
+ auto ub_ptr = (uint8_t *)(ub);
258
+ for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
259
+ *ub_ptr = *gm_ptr;
260
+ }
261
+ }
262
+
263
+ extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
264
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
265
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
266
+ int64_t input_ne_ub[4];
267
+ size_t input_nb_ub[4];
268
+ int64_t output_ne_ub[4];
269
+
270
+ copy_to_ub(input_ne_gm, input_ne_ub, 32);
271
+ copy_to_ub(input_nb_gm, input_nb_ub, 32);
272
+ copy_to_ub(output_ne_gm, output_ne_ub, 32);
273
+
274
+ QUANTIZE_FLOAT_TO_Q4_0<half> op;
275
+ op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
276
+ op.calculate();
277
+ }
278
+
279
+ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
280
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
281
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
282
+ int64_t input_ne_ub[4];
283
+ size_t input_nb_ub[4];
284
+ int64_t output_ne_ub[4];
285
+
286
+ copy_to_ub(input_ne_gm, input_ne_ub, 32);
287
+ copy_to_ub(input_nb_gm, input_nb_ub, 32);
288
+ copy_to_ub(output_ne_gm, output_ne_ub, 32);
289
+
290
+ QUANTIZE_FLOAT_TO_Q4_0<float> op;
291
+ op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
292
+ op.calculate();
293
+ }
294
+
295
+ #endif // #ifdef ASCEND_310P