@fugood/llama.node 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
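Most of the removed lines in the ggml-cpu arch/*/quants.c and arch/*/repack.cpp files follow a single pattern: each architecture-specific kernel previously carried its own copy of the scalar fallback loop after its SIMD path, and this version replaces those duplicated loops with calls to shared *_generic implementations. A minimal sketch of the resulting shape, using the RISC-V guard from the hunks excerpted below (qX_qY is a placeholder, not a real symbol):

    void ggml_vec_dot_qX_qY(int n, float * s, size_t bs, const void * vx, size_t bx,
                            const void * vy, size_t by, int nrc) {
    #if defined(__riscv_v)
        // architecture-specific vector implementation computes *s directly
    #else
        // no vector extension at compile time: defer to the shared scalar kernel
        ggml_vec_dot_qX_qY_generic(n, s, bs, vx, bx, vy, by, nrc);
    #endif
    }

The llama-arch, llama-graph, llama-hparams, and llama-model entries in the list are a separate model-side change whose hunks are not excerpted here; only the RISC-V hunks are shown below.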
package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c

@@ -116,6 +116,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 //===================================== Dot products =================================
 
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -132,7 +133,6 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     int ib = 0;
     float sumf = 0;
 
-#if defined(__riscv_v)
     size_t vl = qk / 2;
 
     for (; ib < nb; ++ib) {
@@ -164,27 +164,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
-            const int v1 = (x[ib].qs[j] >> 4) - 8;
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
-    }
-
     *s = sumf;
+#else
+    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
     const int qk = QK8_1;
     const int nb = n / qk;
 
@@ -201,7 +188,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     int ib = 0;
     float sumf = 0;
 
-#if defined(__riscv_v)
    size_t vl = qk / 2;
 
     for (; ib < nb; ++ib) {
@@ -229,27 +215,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F);
-            const int v1 = (x[ib].qs[j] >> 4);
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -267,7 +240,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const block_q5_0 * GGML_RESTRICT x = vx;
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-#if defined(__riscv_v)
     size_t vl;
     size_t vlenb = __riscv_vlenb();
 
@@ -297,33 +269,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
-            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
     *s = sumf;
+#else
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
     const int qk = QK8_1;
     const int nb = n / qk;
 
@@ -341,7 +294,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const block_q5_1 * GGML_RESTRICT x = vx;
     const block_q8_1 * GGML_RESTRICT y = vy;
 
-#if defined(__riscv_v)
     size_t vl;
     size_t vlenb = __riscv_vlenb();
 
@@ -370,30 +322,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -431,18 +363,17 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi = 0;
+    *s = sumf;
+#else
 
-        for (int j = 0; j < qk; j++) {
-            sumi += x[ib].qs[j]*y[ib].qs[j];
-        }
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
 
-        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
-    }
-
-    *s = sumf;
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
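The UNUSED(...) statements added in the #else branches here and in the K-quant hunks below silence unused-variable warnings for locals that are now consumed only by the vector path. Assuming ggml's usual macro definitions (shown for context; not part of this diff):

    #define GGML_UNUSED(x) (void)(x)   // evaluates x once, discards the value
    #define UNUSED GGML_UNUSED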
@@ -738,44 +669,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
 #else
 
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
 
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1147,68 +1045,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 
 #else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
 
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
 
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 
 }
@@ -1534,60 +1378,15 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
 #else
 
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(nb);
+    UNUSED(utmp);
 
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1698,65 +1497,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
 #else
 
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(nb);
+    UNUSED(utmp);
 
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2024,46 +1773,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
 #else
 
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
 
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l +  0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a  += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp

@@ -112,31 +112,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }
 
 #endif
-    {
-        float sumf[8];
-        int sumi;
-
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                    }
-                }
-            }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-        }
-    }
+    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -361,37 +337,6 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
         return;
     }
 
-#endif
-    float sumf[4][8];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
+#endif
+    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }