@fugood/llama.node 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
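Nearly all of the churn in the quants.c and repack.cpp files above comes from one refactor: each ggml_vec_dot_* kernel drops its hand-written per-architecture scalar fallback loop, and builds without the relevant SIMD extension now call a shared *_generic implementation instead, which is what the large negative line counts reflect. The hunks below all show this pattern; judging by the vec_extract/vsumf0 VSX intrinsics, they appear to come from the powerpc quants.c. A minimal compilable sketch of the resulting dispatch shape (illustrative names, not the verbatim ggml source):

    #include <stdio.h>

    /* Shared scalar reference implementation: after this release it lives
     * once in the generic quants code instead of being copied into every
     * per-architecture file. */
    static void vec_dot_generic(int n, float * s, const float * x, const float * y) {
        float sumf = 0.0f;
        for (int i = 0; i < n; ++i) {
            sumf += x[i] * y[i];
        }
        *s = sumf;
    }

    /* Per-arch kernel after the refactor: a SIMD path, or a straight call
     * into the generic implementation, with no scalar tail loop between. */
    static void vec_dot(int n, float * s, const float * x, const float * y) {
    #if defined(__POWER9_VECTOR__)
        /* The real file runs its VSX loop here; this sketch reuses the
         * scalar reference so it stays compilable on any target. */
        vec_dot_generic(n, s, x, y);
    #else
        vec_dot_generic(n, s, x, y);  /* no per-arch scalar fallback anymore */
    #endif
    }

    int main(void) {
        const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        const float y[4] = {4.0f, 3.0f, 2.0f, 1.0f};
        float s = 0.0f;
        vec_dot(4, &s, x, y);
        printf("%f\n", s);  /* 1*4 + 2*3 + 3*2 + 4*1 = 20.000000 */
        return 0;
    }

The design win is that the scalar reference for each quant format now exists in exactly one place, so the per-arch files shrink to their SIMD paths plus a one-line fallback.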
@@ -201,24 +201,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
-            const int v1 = (x[ib].qs[j] >>   4) - 8;
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
-    }
-
     *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
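A note on the new #else epilogue above, which every converted kernel repeats: x, y, ib, and sumf are declared in setup code shared with the SIMD branch and are now consumed only by the VSX path, so the fallback branch marks them with UNUSED (in ggml this boils down to a (void) cast, GGML_UNUSED(x) being (void)(x)) to keep -Wunused-variable warnings out of non-SIMD builds before delegating to the generic kernel.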
@@ -278,24 +268,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F);
-            const int v1 = (x[ib].qs[j] >>   4);
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -360,30 +340,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
-            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>   4) | xh_1) - 16);
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
     *s = sumf;
+#else
+    UNUSED(ib);
+    UNUSED(sumf);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -451,30 +415,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[ib].qs[j] >>  4) | xh_1;
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(sumf);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -535,18 +484,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk; j++) {
-            sumi += x[ib].qs[j]*y[ib].qs[j];
-        }
-
-        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
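The K-quant and i-quant kernels that follow get the same conversion, with two differences visible in the hunks below: their scalar fallbacks lived entirely inside the #else branch (so the whole branch body is swapped for the generic call, rather than a tail loop being cut after #endif), and the UNUSED list grows to whatever only the SIMD path touches, such as the scale-unpacking masks kmask1/kmask2/kmask3 and the utmp scratch buffer in the q3_K, q4_K, and q5_K kernels.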
@@ -695,45 +641,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -907,70 +818,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast allows the compiler to vectorize the whole function, but the results are almost always worse
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1130,61 +984,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1342,66 +1149,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1556,47 +1311,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a  += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1737,34 +1455,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.125f * vec_extract(vsumf0, 0);
 
 #else
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const  int8_t  * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, q2, 2*sizeof(uint32_t));
-            q2 += 4;
-            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1869,42 +1563,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = 0.125f * vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
-        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
-            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls2;
-            q2 += 4;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2030,47 +1692,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = 0.125f * vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const int8_t  * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint8_t * signs = qs + QK_K/8;
-
-        int bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
-            int ls2 = 1 + 2*(x[i].scales[ib32] >>  4);
-            int sumi1 = 0, sumi2 = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += ls1 * sumi1 + ls2 * sumi2;
-            qs += 4;
-            signs += 4;
-        }
-
-        sumf += d * bsum;
-    }
-
-    *s = 0.125f * sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2172,36 +1798,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.25f * vec_extract(vsumf0, 0);
 
 #else
-
-    uint32_t aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
-        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
-            const uint32_t ls = 2*(aux32 >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
-                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            q3 += 8;
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.25f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2327,48 +1927,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * GGML_RESTRICT qs = x[i].qs;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const uint8_t * GGML_RESTRICT signs = x[i].signs;
-        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
-            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls2;
-        }
-        sumf += d * bsum;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2481,36 +2043,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        int sumi = 0, sumi1 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
-            const int delta = qh[ib] & 0x8000 ? -1 : 1;
-            int lsum = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
-                for (int j = 0; j < 8; ++j) {
-                    lsum += q8[j] * grid[j];
-                }
-                q8 += 8;
-            }
-            sumi  += ls * lsum;
-            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
-            qs += 4;
-        }
-
-        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
-    }
-
-    *s = sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2581,17 +2117,15 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
-        int sumi1 = 0, sumi2 = 0;
-        for (int j = 0; j < QK4_NL/2; ++j) {
-            sumi1 += y[ib].qs[j+       0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
-            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >>  4];
-        }
-        sumf += d * (sumi1 + sumi2);
-    }
     *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2696,37 +2230,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = vec_extract(vsumf0, 0);
 
 #else
-    float sumf = 0;
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
-        uint16_t h = x[ibl].scales_h;
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t  * q8 = y[ibl].qs;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
-            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
-            h >>= 4;
-            const float d1 = d4d8*(ls1 - 32);
-            const float d2 = d4d8*(ls2 - 32);
-            int sumi1 = 0, sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
-            }
-            sumf += d1 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-            sumi1 = sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
-            }
-            sumf += d2 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-        }
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 