@fugood/llama.node 1.4.3 → 1.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/lib/index.js +9 -0
- package/lib/index.ts +10 -0
- package/package.json +15 -15
- package/src/LlamaContext.cpp +24 -0
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/arg.cpp +19 -7
- package/src/llama.cpp/common/common.cpp +46 -2
- package/src/llama.cpp/common/common.h +7 -0
- package/src/llama.cpp/common/log.cpp +3 -26
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
- package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +38 -11
- package/src/llama.cpp/src/llama-model.cpp +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +0 -29
- package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
- package/src/llama.cpp/src/unicode.cpp +2 -2
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h

@@ -0,0 +1,333 @@
+#pragma once
+
+typedef vector unsigned char vec_t;
+typedef __vector_quad acc_t;
+
+template <typename TA>
+class tinyBLAS_Q0_PPC {
+  public:
+    tinyBLAS_Q0_PPC(int64_t k,
+                    const TA *A, int64_t lda,
+                    const block_q8_0 *B, int64_t ldb,
+                    float *C, int64_t ldc,
+                    int ith, int nth);
+
+    void matmul(int64_t m, int64_t n);
+    void matmul_tiled_q0(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) {
+        vec_t A_pack[mc*kc*2];
+        vec_t B_pack[nc*kc*2];
+        int comparray[mc*kc];
+        constexpr bool is_Ablock_q4 = std::is_same_v<TA, block_q4_0>;
+        int64_t ytiles = m / mc;
+        int64_t xtiles = n / nc;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles) {
+            end = tiles;
+        }
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = (job / xtiles) * mc;
+            int64_t jj = (job % xtiles) * nc;
+            for (int64_t kk = 0; kk < k; kk += kc) {
+                if constexpr(is_Ablock_q4) {
+                    packNormalInt4_large(A + ii*lda + kk, lda, mc, 4, (int8_t*)A_pack, comparray);
+                } else {
+                    packNormal_large<int8_t, vector signed char>(A + ii*lda + kk, lda, mc, 8, (int8_t*)A_pack, false, comparray);
+                }
+                packNormal_large<uint8_t, vector unsigned char>(B + jj*ldb + kk, ldb, nc, 8, (uint8_t*)B_pack, true);
+                KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack, comparray);
+            }
+        }
+    }
+
+  private:
+    inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
+        for (int I = 0; I < RM; I++) {
+            for (int J = 0; J < RN; J++) {
+                *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
+            }
+        }
+    }
+
+    inline void add_save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
+        for (int I = 0; I < RM; I++) {
+            for (int J = 0; J < RN; J++) {
+                float * c_ptr = (float *)(C+ii+((jj+J)*ldc)+I);
+                *c_ptr += *((float*)&fin_res[idx+I]+J);
+            }
+        }
+    }
+
+    template<typename ArrayType>
+    inline void compute(acc_t* ACC, int c_idx, int s_idx, ArrayType& comparray, vector float* vs, vector float* fin_res) {
+        vector signed int vec_C[4];
+        vector float CA[4] = {0};
+        vector float res[4] = {0};
+        __builtin_mma_disassemble_acc(vec_C, ACC);
+        for (int i = 0; i < 4; i++) {
+            CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
+            res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
+            fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
+        }
+    }
+
+    inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
+        const vector signed char lowMask = vec_splats((signed char)0xF);
+        const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+        const vector signed char v8 = vec_splats((signed char)0x8);
+        vector signed int vsum = {0};
+        vector signed int vsum2 = {0};
+        c[0] = vec_and(c[1], lowMask);
+        c[1] = vec_sr(c[1], v4);
+        c[0] = vec_sub(c[0], v8);
+        c[1] = vec_sub(c[1], v8);
+        vsum = vec_sum4s(c[0], vsum);
+        vsum2 = vec_sum4s(c[1], vsum2);
+        vsum = vec_add(vsum, vsum2);
+        *(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+    }
+
+    template <typename V1, typename V2>
+    inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
+        vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+        vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
+        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
+        V2 t1, t2, t3, t4, t5, t6, t7, t8;
+        vector unsigned char xor_vector;
+        uint8_t flip_vec = 0x80;
+        xor_vector = vec_splats(flip_vec);
+        t1 = vec_perm(s1, s2, swiz1);
+        t2 = vec_perm(s1, s2, swiz2);
+        t3 = vec_perm(s3, s4, swiz1);
+        t4 = vec_perm(s3, s4, swiz2);
+        t5 = vec_perm(t1, t3, swiz3);
+        t6 = vec_perm(t1, t3, swiz4);
+        t7 = vec_perm(t2, t4, swiz3);
+        t8 = vec_perm(t2, t4, swiz4);
+        if (flip == true) {
+            t5 = vec_xor(t5, xor_vector);
+            t6 = vec_xor(t6, xor_vector);
+            t7 = vec_xor(t7, xor_vector);
+            t8 = vec_xor(t8, xor_vector);
+        }
+        vec_xst(t5, 0, vecOffset);
+        vec_xst(t6, 0, vecOffset+16);
+        vec_xst(t7, 0, vecOffset+32);
+        vec_xst(t8, 0, vecOffset+48);
+    }
+
+    template<int RM, int RN>
+    inline void kernel(int64_t ii, int64_t jj) {
+        if constexpr(RM == 4 && RN == 8) {
+            KERNEL_4x8(ii,jj);
+        } else if constexpr(RM == 8 && RN == 4) {
+            KERNEL_8x4(ii,jj);
+        } else if constexpr(RM == 8 && RN == 8) {
+            KERNEL_8x8(ii,jj);
+        } else {
+            assert(false && "RN/RM values not supported");
+        }
+    }
+    template<int size>
+    void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray);
+    template<typename VA, typename VB>
+    void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip);
+    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n);
+    void KERNEL_4x8(int64_t ii, int64_t jj);
+    void KERNEL_8x4(int64_t ii, int64_t jj);
+    void KERNEL_8x8(int64_t ii, int64_t jj);
+    void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN);
+    template <int RM, int RN>
+    void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n);
+
+    void compute_scale(int64_t ii, int64_t jj, int blk, vector float* vs){
+        for (int I = 0; I<8; I++) {
+            float a_scale = unhalf((A+((ii+I)*lda)+blk)->d);
+            for (int J = 0; J<4; J++) {
+                *((float*)&vs[I]+J) = (a_scale * unhalf((B+((jj+J)*ldb)+blk)->d));
+                *((float*)&vs[I+8]+J) = (a_scale * unhalf((B+((jj+J+4)*ldb)+blk)->d));
+            }
+        }
+    }
+
+    inline void process_q8_elements(const int8_t *qs, int *ca) {
+        vector signed char c1 = vec_xl(0, qs);
+        vector signed char c2 = vec_xl(16, qs);
+        vector signed int vsum1 = {0};
+        vector signed int vsum2 = {0};
+        vsum1 = vec_sum4s(c1, vsum1);
+        vsum2 = vec_sum4s(c2, vsum2);
+        vector signed int vsum = vec_add(vsum1, vsum2);
+        *ca = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+    }
+
+    template<typename VA, typename VB>
+    void packNormal_large(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip, int* comparray=nullptr) {
+        int64_t i, j;
+        block_q8_0 *aoffset = NULL;
+        VA *vecOffset = NULL;
+        block_q8_0* aoffsets[8];
+        __vector_pair arr[8];
+        VB c[8][2] = {0};
+        VB c1[8] = {0}; VB c2[8] = {0};
+        aoffset = const_cast<block_q8_0*>(a);
+        vecOffset = vec;
+        j = (rows >> 3);
+        int index = 0;
+        if (j > 0) {
+            do {
+                for (int it = 0; it < 8; it++)
+                    aoffsets[it] = aoffset + it*lda;
+                aoffset += 8 * lda;
+                for (int blk = 0; blk < kc; blk++) {
+                    for (int it = 0; it < 8; it++) {
+                        arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)(aoffsets[it]+blk)->qs);
+                        __builtin_vsx_disassemble_pair(c[it], &arr[it]);
+                        c1[it] = c[it][0];
+                        c2[it] = c[it][1];
+                        if (comparray){
+                            process_q8_elements((aoffsets[it]+ blk)->qs, &comparray[index + 8*blk + it]);
+                        }
+                    }
+                    vector_permute_store<VA, VB>(c1[0], c1[1], c1[2], c1[3], vecOffset, flip);
+                    vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip);
+                    vector_permute_store<VA, VB>(c1[4], c1[5], c1[6], c1[7], vecOffset+128, flip);
+                    vector_permute_store<VA, VB>(c2[4], c2[5], c2[6], c2[7], vecOffset+192, flip);
+                    vecOffset += 256;
+                }
+                j--;
+                index += 8*kc;
+            } while(j > 0);
+        }
+
+    }
+
+    void packNormalInt4_large(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, int*comparray) {
+        int64_t i, j;
+        TA *aoffset = NULL;
+        int8_t *vecOffset = NULL;
+        TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+        TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+        vector signed char c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
+        vector signed char c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
+        aoffset = const_cast<TA*>(a);
+        vecOffset = vec;
+        int index = 0;
+        j = (rows >> 3);
+        if (j > 0) {
+            do {
+                aoffset1 = aoffset;
+                aoffset2 = aoffset1 + lda;
+                aoffset3 = aoffset2 + lda;
+                aoffset4 = aoffset3 + lda;
+                aoffset5 = aoffset4 + lda;
+                aoffset6 = aoffset5 + lda;
+                aoffset7 = aoffset6 + lda;
+                aoffset8 = aoffset7 + lda;
+                aoffset += 8 * lda;
+                for (int blk = 0; blk < kc; blk++) {
+                    c1[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset1+blk)->qs));
+                    c2[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset2+blk)->qs));
+                    c3[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset3+blk)->qs));
+                    c4[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset4+blk)->qs));
+                    c5[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset5+blk)->qs));
+                    c6[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset6+blk)->qs));
+                    c7[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset7+blk)->qs));
+                    c8[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset8+blk)->qs));
+
+                    process_q4_elements(c1, &comparray[index + 8*blk+0]);
+                    process_q4_elements(c2, &comparray[index + 8*blk+1]);
+                    process_q4_elements(c3, &comparray[index + 8*blk+2]);
+                    process_q4_elements(c4, &comparray[index + 8*blk+3]);
+                    process_q4_elements(c5, &comparray[index + 8*blk+4]);
+                    process_q4_elements(c6, &comparray[index + 8*blk+5]);
+                    process_q4_elements(c7, &comparray[index + 8*blk+6]);
+                    process_q4_elements(c8, &comparray[index + 8*blk+7]);
+                    vector_permute_store<int8_t, vector signed char>(c1[0], c2[0], c3[0], c4[0], vecOffset, false);
+                    vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false);
+                    vector_permute_store<int8_t, vector signed char>(c5[0], c6[0], c7[0], c8[0], vecOffset+128, false);
+                    vector_permute_store<int8_t, vector signed char>(c5[1], c6[1], c7[1], c8[1], vecOffset+192, false);
+                    vecOffset += 256;
+                }
+                j--;
+                index += 8*kc;
+            } while (j > 0);
+        }
+    }
+
+    void KERNEL_Q0(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, int64_t l, vec_t *vec_A, vec_t *vec_B, int *comparray) {
+        acc_t acc[8];
+        for (int i = 0; i < mc ; i += 8) {
+            for (int j = 0; j < nc; j += 8) {
+                vector float fin_res[16] = {0};
+                vector float vs[16] = {0};
+                for (int64_t kk = 0; kk < kc; kk+=2) {
+                    for (int x = 0; x < 8; x++) {
+                        __builtin_mma_xxsetaccz(&acc[x]);
+                    }
+                    int A_block_idx = (i/8)*(16*kc) + kk*16;
+                    int B_block_idx = (j/8)*(16*kc)+ kk*16;
+                    vec_t *A_block = &vec_A[A_block_idx];
+                    vec_t *B_block = &vec_B[B_block_idx];
+                    for (int x = 0; x < 8; x++) {
+                        __builtin_mma_xvi8ger4pp(&acc[0], A_block[x], B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[1], A_block[x + 8], B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[2], A_block[x], B_block[x+8]);
+                        __builtin_mma_xvi8ger4pp(&acc[3], A_block[x+8], B_block[x+8]);
+                    }
+                    compute_scale(ii+i, jj+j, l+kk, vs);
+                    int c_index = (i/8)*(8*kc)+ kk*8;
+                    int* c_block = &comparray[c_index];
+                    compute(&acc[0], 0, 0, c_block, vs, fin_res);
+                    compute(&acc[1], 4, 4, c_block, vs, fin_res);
+                    compute(&acc[2], 0, 8, c_block, vs, fin_res);
+                    compute(&acc[3], 4, 12, c_block, vs, fin_res);
+
+                    A_block_idx = (i/8)*(16*kc) + (kk+1)*16;
+                    B_block_idx = (j/8)*(16*kc)+ (kk+1)*16;
+                    A_block = &vec_A[A_block_idx];
+                    B_block = &vec_B[B_block_idx];
+                    for (int x = 0; x < 8; x++) {
+                        __builtin_mma_xvi8ger4pp(&acc[4], A_block[x], B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[5], A_block[x + 8], B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[6], A_block[x], B_block[x+8]);
+                        __builtin_mma_xvi8ger4pp(&acc[7], A_block[x+8], B_block[x+8]);
+                    }
+                    compute_scale(ii+i, jj+j, l+kk+1, vs);
+                    c_index = (i/8)*(8*kc)+ (kk+1)*8;
+                    c_block = &comparray[c_index];
+                    compute(&acc[4], 0, 0, c_block, vs, fin_res);
+                    compute(&acc[5], 4, 4, c_block, vs, fin_res);
+                    compute(&acc[6], 0, 8, c_block, vs, fin_res);
+                    compute(&acc[7], 4, 12, c_block, vs, fin_res);
+
+                }
+                if (l == 0) {
+                    save_res(ii+i, jj+j, 0, fin_res);
+                    save_res(ii+i+4, jj+j, 4, fin_res);
+                    save_res(ii+i, jj+j+4, 8, fin_res);
+                    save_res(ii+i+4, jj+j+4, 12, fin_res);
+                } else {
+                    add_save_res(ii+i, jj+j, 0, fin_res);
+                    add_save_res(ii+i+4, jj+j, 4, fin_res);
+                    add_save_res(ii+i, jj+j+4, 8, fin_res);
+                    add_save_res(ii+i+4, jj+j+4, 12, fin_res);
+                }
+            }
+        }
+    }
+
+    const TA *const A;
+    const block_q8_0 *const B;
+    float *C;
+    const int64_t k;
+    int64_t kc;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+    const int ith;
+    const int nth;
+};
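Aside (not part of the diff): for orientation, a minimal usage sketch of how tinyBLAS_Q0_PPC is meant to be driven, based only on the constructor and matmul() signatures declared above. The wrapper function, its name, and the block-unit convention for k/lda/ldb are illustrative assumptions; the real call site lives in sgemm.cpp, and the code requires a PowerPC build with MMA support plus ggml's block_q4_0 / block_q8_0 types.

// Hypothetical per-thread wrapper, for illustration only.
#include "sgemm-ppc.h"

void example_gemm_q4_q8(int64_t m, int64_t n, int64_t k,
                        const block_q4_0 *A, int64_t lda,  // quantized weights
                        const block_q8_0 *B, int64_t ldb,  // quantized activations
                        float *C, int64_t ldc,             // float output
                        int ith, int nth) {                // thread id / thread count
    // Assumption: as in sgemm.cpp, k and the leading dimensions count
    // quantized blocks rather than scalar elements.
    tinyBLAS_Q0_PPC<block_q4_0> tb(k, A, lda, B, ldb, C, ldc, ith, nth);
    // matmul() picks mc/nc/kc tile sizes; for tile-aligned shapes it runs the
    // new matmul_tiled_q0() path, otherwise it falls back to mnpack().
    tb.matmul(m, n);
}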
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp

@@ -117,8 +117,7 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
 #endif

 #if defined(__MMA__)
-
-typedef __vector_quad acc_t;
+#include "sgemm-ppc.h"
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED FUSED MULTIPLY ADD

@@ -1573,95 +1572,35 @@ class tinyBLAS_BF16_PPC {
     const int nth;
 };

-template <typename TA>
-
-
-
-
-
-                    float *C, int64_t ldc,
-                    int ith, int nth)
+template <typename TA>
+tinyBLAS_Q0_PPC<TA>::tinyBLAS_Q0_PPC(int64_t k,
+                    const TA *A, int64_t lda,
+                    const block_q8_0 *B, int64_t ldb,
+                    float *C, int64_t ldc,
+                    int ith, int nth)
     : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    kc = 64;
 }

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    template<int size>
-    inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
-        vector signed int vec_C[4];
-        vector float CA[4] = {0};
-        vector float res[4] = {0};
-        __builtin_mma_disassemble_acc(vec_C, ACC);
-        for (int i = 0; i < 4; i++) {
-            CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
-            res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
-            fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
-        }
-    }
-    /* This function processes quantized data from block_q4_0 elements.
-     * First the we try to extract the two int4 values stored in single int8_t into two signed int8.
-     * And then we subtract each of the resultant element with 8, to convert signed int8 to unsigned int8.
-     * Also compute the rowsum which is required to compensate the above conversion. */
-    inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
-        const vector signed char lowMask = vec_splats((signed char)0xF);
-        const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-        const vector signed char v8 = vec_splats((signed char)0x8);
-        vector signed int vsum = {0};
-        vector signed int vsum2 = {0};
-        c[0] = vec_and(c[1], lowMask);
-        c[1] = vec_sr(c[1], v4);
-        c[0] = vec_sub(c[0], v8);
-        c[1] = vec_sub(c[1], v8);
-        vsum = vec_sum4s(c[0], vsum);
-        vsum2 = vec_sum4s(c[1], vsum2);
-        vsum = vec_add(vsum, vsum2);
-        *(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
-    }
-
-    template <typename V1, typename V2>
-    inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
-        vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
-        vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
-        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
-        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
-        V2 t1, t2, t3, t4, t5, t6, t7, t8;
-        vector unsigned char xor_vector;
-        uint8_t flip_vec = 0x80;
-        xor_vector = vec_splats(flip_vec);
-        t1 = vec_perm(s1, s2, swiz1);
-        t2 = vec_perm(s1, s2, swiz2);
-        t3 = vec_perm(s3, s4, swiz1);
-        t4 = vec_perm(s3, s4, swiz2);
-        t5 = vec_perm(t1, t3, swiz3);
-        t6 = vec_perm(t1, t3, swiz4);
-        t7 = vec_perm(t2, t4, swiz3);
-        t8 = vec_perm(t2, t4, swiz4);
-        if (flip == true) {
-            t5 = vec_xor(t5, xor_vector);
-            t6 = vec_xor(t6, xor_vector);
-            t7 = vec_xor(t7, xor_vector);
-            t8 = vec_xor(t8, xor_vector);
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::matmul(int64_t m, int64_t n) {
+    int mc = 64; int nc = 64;
+    if (n % 8 == 0 && n < nc) {
+        nc = n;
+        mc = 32 ;
+        kc = 32;
+    }
+    const bool is_aligned = ((m & (mc - 1)) == 0) & ((n & (nc - 1)) == 0) & ((k & (kc - 1)) == 0);
+    if (is_aligned) {
+        this->matmul_tiled_q0(m, n, mc, nc, kc);
+    } else {
+        mnpack(0, m, 0, n);
     }
-        vec_xst(t5, 0, vecOffset);
-        vec_xst(t6, 0, vecOffset+16);
-        vec_xst(t7, 0, vecOffset+32);
-        vec_xst(t8, 0, vecOffset+48);
 }

-
-
+template<typename TA>
+template<int size>
+void tinyBLAS_Q0_PPC<TA>::packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
     int64_t i, j;
     TA *aoffset = NULL;
     int8_t *vecOffset = NULL;

@@ -1781,8 +1720,10 @@ class tinyBLAS_Q0_PPC {
     }
     }
 }
+
+template<typename TA>
 template<typename VA, typename VB>
-void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+void tinyBLAS_Q0_PPC<TA>::packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
     int64_t i, j;
     block_q8_0 *aoffset = NULL;
     VA *vecOffset = NULL;

@@ -1822,7 +1763,6 @@ class tinyBLAS_Q0_PPC {
         j--;
     } while(j > 0);
     }
-
     if (rows & 4) {
         aoffsets[0] = aoffset;
         for (int it = 1; it < 4; it++ )

@@ -1878,7 +1818,8 @@ class tinyBLAS_Q0_PPC {
     }
 }

-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
     int m_rem = MIN(m - m0, 16);
     int n_rem = MIN(n - n0, 16);

@@ -1915,7 +1856,8 @@ class tinyBLAS_Q0_PPC {
 }


-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_4x8(int64_t ii, int64_t jj) {
     vec_t vec_A[8], vec_B[16] = {0};
     acc_t acc_0, acc_1;
     std::array<int, 4> comparray {};

@@ -1953,14 +1895,15 @@ class tinyBLAS_Q0_PPC {
             aoffset += lda;
         }
     }
-        compute
-        compute
+        compute(&acc_0, 0, 0, comparray, vs, fin_res);
+        compute(&acc_1, 0, 4, comparray, vs, fin_res);
     }
     save_res(ii, jj, 0, fin_res);
     save_res(ii, jj+4, 4, fin_res);
 }

-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_8x4(int64_t ii, int64_t jj) {
     vec_t vec_A[16], vec_B[8] = {0};
     acc_t acc_0, acc_1;
     std::array<int, 8> comparray {};

@@ -1997,16 +1940,18 @@ class tinyBLAS_Q0_PPC {
             aoffset += lda;
         }
     }
-        compute
-        compute
+        compute(&acc_0, 0, 0, comparray, vs, fin_res);
+        compute(&acc_1, 4, 4, comparray, vs, fin_res);
     }
     save_res(ii, jj, 0, fin_res);
     save_res(ii+4, jj, 4, fin_res);
 }

-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_8x8(int64_t ii, int64_t jj) {
     vec_t vec_A[16], vec_B[16] = {0};
     acc_t acc_0, acc_1, acc_2, acc_3;
+    acc_t acc_4, acc_5, acc_6, acc_7;
     std::array<int, 8> comparray {};
     vector float fin_res[16] = {0};
     vector float vs[16] = {0};

@@ -2046,10 +1991,10 @@ class tinyBLAS_Q0_PPC {
             aoffset += lda;
         }
     }
-        compute
-        compute
-        compute
-        compute
+        compute(&acc_0, 0, 0, comparray, vs, fin_res);
+        compute(&acc_1, 4, 4, comparray, vs, fin_res);
+        compute(&acc_2, 0, 8, comparray, vs, fin_res);
+        compute(&acc_3, 4, 12, comparray, vs, fin_res);
     }
     save_res(ii, jj, 0, fin_res);
     save_res(ii+4, jj, 4, fin_res);

@@ -2057,7 +2002,8 @@ class tinyBLAS_Q0_PPC {
     save_res(ii+4, jj+4, 12, fin_res);
 }

-
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
     int64_t ytiles = (m - m0) / RM;
     int64_t xtiles = (n - n0) / RN;
     int64_t tiles = xtiles * ytiles;

@@ -2125,21 +2071,9 @@ class tinyBLAS_Q0_PPC {
     }
 }

-    template<
-    inline void kernel(int64_t ii, int64_t jj) {
-        if constexpr(RM == 4 && RN == 8) {
-            KERNEL_4x8(ii,jj);
-        } else if constexpr(RM == 8 && RN == 4) {
-            KERNEL_8x4(ii,jj);
-        } else if constexpr(RM == 8 && RN == 8) {
-            KERNEL_8x8(ii,jj);
-        } else {
-            assert(false && "RN/RM values not supported");
-        }
-    }
-
+template<typename TA>
 template <int RM, int RN>
-NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+NOINLINE void tinyBLAS_Q0_PPC<TA>::gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
     int64_t ytiles = (m - m0) / RM;
     int64_t xtiles = (n - n0) / RN;
     int64_t tiles = xtiles * ytiles;

@@ -2151,20 +2085,12 @@ class tinyBLAS_Q0_PPC {
     for (int64_t job = start; job < end; ++job) {
         int64_t ii = m0 + job / xtiles * RM;
         int64_t jj = n0 + job % xtiles * RN;
-        kernel<RM, RN>(ii, jj);
+        this->kernel<RM, RN>(ii, jj);
     }
 }

-
-
-    float *C;
-    const int64_t k;
-    const int64_t lda;
-    const int64_t ldb;
-    const int64_t ldc;
-    const int ith;
-    const int nth;
-};
+template class tinyBLAS_Q0_PPC<block_q4_0>;
+template class tinyBLAS_Q0_PPC<block_q8_0>;

 class tinyBLAS_PPC {
   public: