@fugood/llama.node 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c

@@ -435,30 +435,15 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
            wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);

-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
-            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>  4) | xh_1) - 16);
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(sumf);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }

 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
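Note: the removed scalar tail doubles as a readable spec of the q5_0 bit layout: each value stores its 4 low bits as a nibble in `qs` and its 5th bit in the 32-bit mask `qh`, centered at -16. A standalone sketch of that reconstruction (hypothetical helper, not part of ggml, which operates on `block_q5_0` instead):

    #include <stdint.h>

    // Reconstruct the 32 signed 5-bit weights of one q5_0-style block:
    // low nibbles from qs[16], the fifth bit from the mask qh, minus 16.
    static void dequant_q5_0_block(const uint8_t qs[16], uint32_t qh, int8_t out[32]) {
        for (int j = 0; j < 16; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; // bit j    -> bit 4
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));      // bit j+16 -> bit 4
            out[j]      = (int8_t)(((qs[j] & 0x0F) | xh_0) - 16);
            out[j + 16] = (int8_t)(((qs[j] >>  4) | xh_1) - 16);
        }
    }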
@@ -545,30 +530,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
            wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;

-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[ib].qs[j] >>  4) | xh_1;
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(sumf);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }

 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
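Note: each converted function keeps its locals (`nb`, `ib`, `sumf`, ...) because the SIMD path above the `#else` still uses them; the `UNUSED(...)` calls only silence unused-variable warnings in builds that take the fallback branch. In ggml this macro is, roughly, a cast to void (a sketch; see ggml's headers for the actual definition):

    // Evaluating a variable as (void) marks it as used without generating code.
    #define UNUSED(x) (void)(x)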
@@ -628,18 +598,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
            wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);

-#endif
-    for (; ib < nb; ++ib) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk; j++) {
-            sumi += x[ib].qs[j]*y[ib].qs[j];
-        }
-
-        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }

 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
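Note: q8_0 is the simplest case and shows the structure all of these dot products share: per 32-element block, an int8 dot product scaled by the product of the two block scales. A self-contained sketch under simplified assumptions (float scales instead of ggml's fp16 `ggml_half`, hypothetical struct and function names):

    #include <stdint.h>

    #define QK8_0 32

    // Simplified stand-in for ggml's block_q8_0 (which stores d as fp16).
    typedef struct { float d; int8_t qs[QK8_0]; } blk_q8_0;

    // Scalar q8_0 x q8_0 dot product over n values (n a multiple of QK8_0),
    // mirroring the tail loop removed above in favor of the generic version.
    static float vec_dot_q8_0_scalar(int n, const blk_q8_0 * x, const blk_q8_0 * y) {
        const int nb = n / QK8_0;
        float sumf = 0.0f;
        for (int ib = 0; ib < nb; ++ib) {
            int sumi = 0;
            for (int j = 0; j < QK8_0; j++) {
                sumi += x[ib].qs[j] * y[ib].qs[j]; // int8 multiply-accumulate
            }
            sumf += sumi * (x[ib].d * y[ib].d);    // apply per-block scales
        }
        return sumf;
    }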
@@ -755,45 +722,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-
-
-
-
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

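Note: in the removed q2_K fallback, each byte of `x[i].scales` packs a 4-bit group scale in the low nibble and a 4-bit min in the high nibble; the min side pairs with `y[i].bsums` so the offset can be applied once per block rather than per value. A hypothetical standalone unpacking of that byte layout:

    #include <stdint.h>

    // Split the 16 packed scale/min bytes of a q2_K-style superblock, mirroring
    // `sc[is++] & 0xF` (scales) and `sc[j] >> 4` (mins) in the removed loop.
    static void unpack_q2_K_scales(const uint8_t sc[16], uint8_t scales[16], uint8_t mins[16]) {
        for (int j = 0; j < 16; ++j) {
            scales[j] = sc[j] & 0xF; // weights each group of 16 quants
            mins[j]   = sc[j] >> 4;  // combined with bsums for the dmin term
        }
    }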
@@ -902,68 +834,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-
-
-
-
-
-
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif

 }
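Note: the removed q3_K tail decodes 3-bit values in 32-wide slices: two low bits from `q3` at a given shift, plus a high bit from `hmask` that, when clear, subtracts 4, so values land in [-4, 3]. One slice of that logic as a hypothetical standalone helper:

    #include <stdint.h>

    // One 32-value slice of the removed q3_K reconstruction; `shift` selects
    // the 2-bit field in q3 (0, 2, 4, or 6) and `m` is the current hmask bit.
    static void dequant_q3_K_slice(const uint8_t q3[32], const uint8_t hm[32],
                                   int shift, uint8_t m, int8_t out[32]) {
        for (int l = 0; l < 32; ++l) {
            out[l] = (int8_t)((q3[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4);
        }
    }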
@@ -1089,61 +965,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-
-
-
-
-
-
-
-
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

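Note: q4_K and q5_K share the same 12-byte scale block, eight 6-bit scales plus eight 6-bit mins, which the removed code untangles with the `kmask`/`utmp` shuffle before reading `scales` and `mins` bytewise. A hypothetical standalone version of that unpacking (kmask constants as defined in llama.cpp's quant sources):

    #include <stdint.h>
    #include <string.h>

    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    // Rearrange the packed 12-byte scale block so that the 8 scales and 8 mins
    // become plain bytes, mirroring the utmp shuffle in the removed fallback.
    static void unpack_q4_K_scales(const uint8_t packed[12], uint8_t scales[8], uint8_t mins[8]) {
        uint32_t utmp[4];
        memcpy(utmp, packed, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;
        memcpy(scales, &utmp[0], 8); // first two words: 8 scales
        memcpy(mins,   &utmp[2], 8); // last two words: 8 mins
    }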
@@ -1279,66 +1108,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-
-
-
-
-
-
-
-
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

@@ -1435,47 +1212,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-
-
-
-
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a  += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

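Note: the removed q6_K tail rebuilds each 6-bit value from a 4-bit low part in `ql` and a 2-bit high part in `qh`, then centers it by subtracting 32 (range [-32, 31]), with the four output slices interleaved as in the loop above. A hypothetical standalone version of one 128-value step:

    #include <stdint.h>

    // One 128-value step of the removed q6_K reconstruction: 4 low bits from
    // ql[64], 2 high bits from qh[32], centered by subtracting 32.
    static void dequant_q6_K_step(const uint8_t ql[64], const uint8_t qh[32], int8_t out[128]) {
        for (int l = 0; l < 32; ++l) {
            out[l +  0] = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
            out[l + 32] = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
            out[l + 64] = (int8_t)((ql[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
            out[l + 96] = (int8_t)((ql[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
        }
    }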