@fugood/llama.node 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
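
Note: nearly every hunk in the arch-specific quants.c and repack.cpp files below applies the same mechanical refactor: the hand-written scalar tail that used to follow each SIMD block is deleted, the SIMD branch now stores its result directly into *s, and builds without the relevant SIMD extension fall into an #else branch that silences now-unused locals with UNUSED(...) and forwards to a shared ggml_vec_dot_*_generic implementation. The sketch below is only an illustration of that dispatch shape — vec_dot and vec_dot_generic are placeholder names, not actual ggml symbols — but it compiles stand-alone with or without AVX.

#include <stdio.h>
#if defined(__AVX__)
#include <immintrin.h>
#endif

#define UNUSED(x) (void)(x)  // same trick ggml uses to silence unused-variable warnings

// Shared scalar reference path (one copy, analogous to *_generic in ggml-cpu).
static void vec_dot_generic(int n, float * s, const float * x, const float * y) {
    float sumf = 0.0f;
    for (int i = 0; i < n; ++i) {
        sumf += x[i] * y[i];
    }
    *s = sumf;
}

// Arch entry point: SIMD when available, otherwise forward to the generic path.
static void vec_dot(int n, float * s, const float * x, const float * y) {
    int i = 0;
#if defined(__AVX__)
    __m256 acc = _mm256_setzero_ps();
    for (; i + 8 <= n; i += 8) {  // vector main loop
        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_loadu_ps(x + i),
                                               _mm256_loadu_ps(y + i)));
    }
    __m128 lo = _mm_add_ps(_mm256_castps256_ps128(acc),   // horizontal sum of acc
                           _mm256_extractf128_ps(acc, 1));
    lo = _mm_hadd_ps(lo, lo);
    lo = _mm_hadd_ps(lo, lo);
    float sumf = _mm_cvtss_f32(lo);
    for (; i < n; ++i) {          // scalar remainder
        sumf += x[i] * y[i];
    }
    *s = sumf;                    // the SIMD branch stores the result itself
#else
    UNUSED(i);                    // mirrors the UNUSED(...) lines in the hunks below
    vec_dot_generic(n, s, x, y);
#endif
}

int main(void) {
    float x[12], y[12], s;
    for (int i = 0; i < 12; ++i) { x[i] = (float)i; y[i] = 1.0f; }
    vec_dot(12, &s, x, y);
    printf("%.1f\n", s);          // prints 66.0
    return 0;
}

The upshot is that the scalar reference code now lives in one place instead of being duplicated in every arch file, which is where the large negative line counts in the file list above come from.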
@@ -702,7 +702,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const block_q8_1 * GGML_RESTRICT y = vy;
 
     int ib = 0;
-    float sumf = 0;
 
 #if defined(__AVX2__) || defined(__AVX__)
     // Initialize accumulator with zeros
@@ -737,26 +736,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
     }
 
-
-
+    *s = hsum_float_8(acc) + summs;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F);
-            const int v1 = (x[ib].qs[j] >> 4);
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
-    *s = sumf;
 }
 
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -764,7 +751,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const int nb = n / qk;
 
     int ib = 0;
-    float sumf = 0;
 
     assert(n % qk == 0);
    assert(qk == QK5_0);
@@ -799,7 +785,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = _mm256_fmadd_ps(d, q, acc);
     }
 
-
+    *s = hsum_float_8(acc);
 #elif defined(__AVX__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -830,32 +816,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
     }
 
-
-
+    *s = hsum_float_8(acc);
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
-            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
-    *s = sumf;
 }
 
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -863,7 +831,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const int nb = n / qk;
 
     int ib = 0;
-    float sumf = 0;
 
     assert(n % qk == 0);
     assert(qk == QK5_1);
@@ -901,7 +868,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
     }
 
-
+    *s = hsum_float_8(acc) + summs;
 #elif defined(__AVX__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -935,32 +902,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
     }
 
-
-
+    *s = hsum_float_8(acc) + summs;
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
-
-            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
-    *s = sumf;
 }
 
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1017,7 +966,6 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     }
 
     sumf = hsum_float_8(accum);
-
 #endif
     for (; ib < nb; ++ib) {
         int sumi = 0;
@@ -1157,44 +1105,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(sumf);
 
 #else
-
-
-
-
-    for (int i = 0; i < nb; ++i) {
-        int sum = 0;
-
-        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
-            for (size_t l = 0; l < 5; ++l) {
-                for (size_t m = 0; m < 32; ++m) {
-                    uint8_t q = x[i].qs[j + m] * pow3[l];
-                    uint16_t xi = ((uint16_t) q * 3) >> 8;
-                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
-                }
-            }
-        }
-        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
-            for (size_t l = 0; l < 5; ++l) {
-                for (size_t m = 0; m < 16; ++m) {
-                    uint8_t q = x[i].qs[j + m] * pow3[l];
-                    uint16_t xi = ((uint16_t) q * 3) >> 8;
-                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
-                }
-            }
-        }
-
-        for (size_t l = 0; l < 4; ++l) {
-            for (size_t j = 0; j < sizeof(x->qh); ++j) {
-                uint8_t q = x[i].qh[j] * pow3[l];
-                uint16_t xi = ((uint16_t) q * 3) >> 8;
-                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
-            }
-        }
-
-        sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
-    }
-
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1257,25 +1171,10 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(sumf);
 
 #else
-
-
-
-
-
-        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
-            for (size_t l = 0; l < 4; ++l) {
-                for (size_t k = 0; k < 32; ++k) {
-                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
-                }
-            }
-        }
-
-        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-
-        sumf += (float) sumi * d;
-    }
-
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1464,45 +1363,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc);
 
 #else
-
-
-
-
-
-        const uint8_t * q2 = x[i].qs;
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1769,70 +1633,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc);
 
 #else
-
-
-
-
-
-
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2002,61 +1809,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
 
 #else
-
-
-
-
-
-
-
-
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2259,66 +2019,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc) + summs;
 
 #else
-
-
-
-
-
-
-
-
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2520,47 +2228,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc);
 
 #else
-
-
-
-
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2712,34 +2383,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.125f * hsum_float_8(accumf);
 
 #else
-
-
-
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, q2, 2*sizeof(uint32_t));
-            q2 += 4;
-            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -3033,42 +2680,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = 0.125f * hsum_float_8(accumf);
 
 #else
-
-
-
-
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const uint8_t * GGML_RESTRICT sc = x[i].scales;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
-            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls2;
-            q2 += 4;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -3250,47 +2865,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = 0.125f * hsum_float_8(accumf);
 
 #else
-
-
-
-
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint8_t * signs = qs + QK_K/8;
-
-        int bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
-            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
-            int sumi1 = 0, sumi2 = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += ls1 * sumi1 + ls2 * sumi2;
-            qs += 4;
-            signs += 4;
-        }
-
-        sumf += d * bsum;
-    }
-
-    *s = 0.125f * sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -3410,36 +2989,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.25f * hsum_float_8(accumf);
 
 #else
-
-
-
-
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
-            const uint32_t ls = 2*(aux32 >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
-                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
-                const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            q3 += 8;
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.25f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -3646,48 +3199,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(accumf);
 
 #else
-
-
-
-
-        const uint8_t * GGML_RESTRICT qs = x[i].qs;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const uint8_t * GGML_RESTRICT signs = x[i].signs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
-            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls2;
-        }
-        sumf += d * bsum;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -3811,36 +3326,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
 
 #else
-
-
-
-
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        int sumi = 0, sumi1 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
-            const int delta = qh[ib] & 0x8000 ? -1 : 1;
-            int lsum = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
-                for (int j = 0; j < 8; ++j) {
-                    lsum += q8[j] * grid[j];
-                }
-                q8 += 8;
-            }
-            sumi += ls * lsum;
-            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
-            qs += 4;
-        }
-
-        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
-    }
-
-    *s = sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -4043,52 +3532,11 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
 
 #else
-
-
-
-
-
-
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint16_t * sc = (const uint16_t *)x[i].scales;
-
-        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            delta[0] = qh[0] & 0x08 ? -1 : 1;
-            delta[1] = qh[0] & 0x80 ? -1 : 1;
-            delta[2] = qh[1] & 0x08 ? -1 : 1;
-            delta[3] = qh[1] & 0x80 ? -1 : 1;
-            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
-                int lsum1 = 0, lsum2 = 0;
-                for (int j = 0; j < 8; ++j) {
-                    lsum1 += q8[j] * grid[j];
-                    lsum2 += q8[j];
-                }
-                q8 += 8;
-                sum1[l/2] += lsum1;
-                sum2[l/2] += lsum2*delta[l];
-            }
-
-            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
-            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
-
-            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
-            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
-            qs += 4;
-            qh += 2;
-        }
-
-        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
-    }
-
-    *s = sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(scale);
+    ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -4275,37 +3723,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = hsum_float_8(accum);
 
 #else
-
-
-
-
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t * q8 = y[ibl].qs;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
-            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
-            h >>= 4;
-            const float d1 = d4d8*(ls1 - 32);
-            const float d2 = d4d8*(ls2 - 32);
-            int sumi1 = 0, sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d1 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-            sumi1 = sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d2 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-        }
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 