@fugood/llama.node 1.1.1 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +13 -0
- package/lib/index.js +3 -0
- package/lib/index.ts +6 -0
- package/package.json +14 -14
- package/src/LlamaCompletionWorker.cpp +3 -2
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +50 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
- package/src/tts_utils.h +3 -3
|
@@ -821,24 +821,15 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
821
821
|
|
|
822
822
|
sumf = hsum_float_8(acc) + summs;
|
|
823
823
|
|
|
824
|
-
#endif
|
|
825
|
-
for (; ib < nb; ++ib) {
|
|
826
|
-
int sumi0 = 0;
|
|
827
|
-
int sumi1 = 0;
|
|
828
|
-
|
|
829
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
830
|
-
const int v0 = (x[ib].qs[j] & 0x0F);
|
|
831
|
-
const int v1 = (x[ib].qs[j] >> 4);
|
|
832
|
-
|
|
833
|
-
sumi0 += (v0 * y[ib].qs[j]);
|
|
834
|
-
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
|
835
|
-
}
|
|
836
|
-
|
|
837
|
-
int sumi = sumi0 + sumi1;
|
|
838
|
-
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
839
|
-
}
|
|
840
|
-
|
|
841
824
|
*s = sumf;
|
|
825
|
+
#else
|
|
826
|
+
UNUSED(nb);
|
|
827
|
+
UNUSED(x);
|
|
828
|
+
UNUSED(y);
|
|
829
|
+
UNUSED(ib);
|
|
830
|
+
UNUSED(sumf);
|
|
831
|
+
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
832
|
+
#endif
|
|
842
833
|
}
|
|
843
834
|
|
|
844
835
|
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -883,30 +874,15 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
883
874
|
|
|
884
875
|
sumf = hsum_float_8(acc);
|
|
885
876
|
|
|
886
|
-
#endif
|
|
887
|
-
for (; ib < nb; ++ib) {
|
|
888
|
-
uint32_t qh;
|
|
889
|
-
memcpy(&qh, x[ib].qh, sizeof(qh));
|
|
890
|
-
|
|
891
|
-
int sumi0 = 0;
|
|
892
|
-
int sumi1 = 0;
|
|
893
|
-
|
|
894
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
895
|
-
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
|
896
|
-
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
|
897
|
-
|
|
898
|
-
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
|
|
899
|
-
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
|
|
900
|
-
|
|
901
|
-
sumi0 += (x0 * y[ib].qs[j]);
|
|
902
|
-
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
|
903
|
-
}
|
|
904
|
-
|
|
905
|
-
int sumi = sumi0 + sumi1;
|
|
906
|
-
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
|
907
|
-
}
|
|
908
|
-
|
|
909
877
|
*s = sumf;
|
|
878
|
+
#else
|
|
879
|
+
UNUSED(nb);
|
|
880
|
+
UNUSED(ib);
|
|
881
|
+
UNUSED(sumf);
|
|
882
|
+
UNUSED(x);
|
|
883
|
+
UNUSED(y);
|
|
884
|
+
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
885
|
+
#endif
|
|
910
886
|
}
|
|
911
887
|
|
|
912
888
|
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -954,30 +930,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
954
930
|
|
|
955
931
|
sumf = hsum_float_8(acc) + summs;
|
|
956
932
|
|
|
957
|
-
#endif
|
|
958
|
-
for (; ib < nb; ++ib) {
|
|
959
|
-
uint32_t qh;
|
|
960
|
-
memcpy(&qh, x[ib].qh, sizeof(qh));
|
|
961
|
-
|
|
962
|
-
int sumi0 = 0;
|
|
963
|
-
int sumi1 = 0;
|
|
964
|
-
|
|
965
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
966
|
-
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
|
967
|
-
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
|
968
|
-
|
|
969
|
-
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
|
|
970
|
-
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
|
|
971
|
-
|
|
972
|
-
sumi0 += (x0 * y[ib].qs[j]);
|
|
973
|
-
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
|
974
|
-
}
|
|
975
|
-
|
|
976
|
-
int sumi = sumi0 + sumi1;
|
|
977
|
-
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
978
|
-
}
|
|
979
|
-
|
|
980
933
|
*s = sumf;
|
|
934
|
+
#else
|
|
935
|
+
UNUSED(nb);
|
|
936
|
+
UNUSED(ib);
|
|
937
|
+
UNUSED(sumf);
|
|
938
|
+
UNUSED(x);
|
|
939
|
+
UNUSED(y);
|
|
940
|
+
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
941
|
+
#endif
|
|
981
942
|
}
|
|
982
943
|
|
|
983
944
|
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -1016,18 +977,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1016
977
|
|
|
1017
978
|
sumf = hsum_float_8(acc);
|
|
1018
979
|
|
|
1019
|
-
#endif
|
|
1020
|
-
for (; ib < nb; ++ib) {
|
|
1021
|
-
int sumi = 0;
|
|
1022
|
-
|
|
1023
|
-
for (int j = 0; j < qk; j++) {
|
|
1024
|
-
sumi += x[ib].qs[j]*y[ib].qs[j];
|
|
1025
|
-
}
|
|
1026
|
-
|
|
1027
|
-
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
1028
|
-
}
|
|
1029
|
-
|
|
1030
980
|
*s = sumf;
|
|
981
|
+
#else
|
|
982
|
+
UNUSED(nb);
|
|
983
|
+
UNUSED(ib);
|
|
984
|
+
UNUSED(sumf);
|
|
985
|
+
UNUSED(x);
|
|
986
|
+
UNUSED(y);
|
|
987
|
+
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
988
|
+
#endif
|
|
1031
989
|
}
|
|
1032
990
|
|
|
1033
991
|
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -1103,45 +1061,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1103
1061
|
*s = hsum_float_8(acc);
|
|
1104
1062
|
|
|
1105
1063
|
#else
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
const uint8_t * q2 = x[i].qs;
|
|
1112
|
-
const int8_t * q8 = y[i].qs;
|
|
1113
|
-
const uint8_t * sc = x[i].scales;
|
|
1114
|
-
|
|
1115
|
-
int summs = 0;
|
|
1116
|
-
for (int j = 0; j < 16; ++j) {
|
|
1117
|
-
summs += y[i].bsums[j] * (sc[j] >> 4);
|
|
1118
|
-
}
|
|
1119
|
-
|
|
1120
|
-
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1121
|
-
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1122
|
-
|
|
1123
|
-
int isum = 0;
|
|
1124
|
-
int is = 0;
|
|
1125
|
-
int d;
|
|
1126
|
-
for (int k = 0; k < QK_K/128; ++k) {
|
|
1127
|
-
int shift = 0;
|
|
1128
|
-
for (int j = 0; j < 4; ++j) {
|
|
1129
|
-
d = sc[is++] & 0xF;
|
|
1130
|
-
int isuml = 0;
|
|
1131
|
-
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
1132
|
-
isum += d * isuml;
|
|
1133
|
-
d = sc[is++] & 0xF;
|
|
1134
|
-
isuml = 0;
|
|
1135
|
-
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
1136
|
-
isum += d * isuml;
|
|
1137
|
-
shift += 2;
|
|
1138
|
-
q8 += 32;
|
|
1139
|
-
}
|
|
1140
|
-
q2 += 32;
|
|
1141
|
-
}
|
|
1142
|
-
sumf += dall * isum - dmin * summs;
|
|
1143
|
-
}
|
|
1144
|
-
*s = sumf;
|
|
1064
|
+
UNUSED(x);
|
|
1065
|
+
UNUSED(y);
|
|
1066
|
+
UNUSED(nb);
|
|
1067
|
+
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1145
1068
|
#endif
|
|
1146
1069
|
}
|
|
1147
1070
|
|
|
@@ -1239,70 +1162,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1239
1162
|
*s = hsum_float_8(acc);
|
|
1240
1163
|
|
|
1241
1164
|
#else
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
// write vectorized versions for AVX, ARM_NEON, etc.
|
|
1249
|
-
|
|
1250
|
-
int8_t aux8[QK_K];
|
|
1251
|
-
int16_t aux16[8];
|
|
1252
|
-
float sums [8];
|
|
1253
|
-
int32_t aux32[8];
|
|
1254
|
-
memset(sums, 0, 8*sizeof(float));
|
|
1255
|
-
|
|
1256
|
-
uint32_t auxs[4];
|
|
1257
|
-
const int8_t * scales = (const int8_t*)auxs;
|
|
1258
|
-
|
|
1259
|
-
float sumf = 0;
|
|
1260
|
-
for (int i = 0; i < nb; ++i) {
|
|
1261
|
-
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
1262
|
-
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
|
1263
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1264
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
1265
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
1266
|
-
uint8_t m = 1;
|
|
1267
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
1268
|
-
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
|
1269
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
1270
|
-
a += 32; m <<= 1;
|
|
1271
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
|
1272
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
1273
|
-
a += 32; m <<= 1;
|
|
1274
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
|
1275
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
1276
|
-
a += 32; m <<= 1;
|
|
1277
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
|
1278
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
1279
|
-
a += 32; m <<= 1;
|
|
1280
|
-
q3 += 32;
|
|
1281
|
-
}
|
|
1282
|
-
a = aux8;
|
|
1283
|
-
|
|
1284
|
-
memcpy(auxs, x[i].scales, 12);
|
|
1285
|
-
uint32_t tmp = auxs[2];
|
|
1286
|
-
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
|
1287
|
-
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
|
1288
|
-
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
|
1289
|
-
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
|
1290
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
1291
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1292
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
1293
|
-
q8 += 8; a += 8;
|
|
1294
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1295
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
1296
|
-
q8 += 8; a += 8;
|
|
1297
|
-
}
|
|
1298
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1299
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1300
|
-
}
|
|
1301
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
1302
|
-
*s = sumf;
|
|
1303
|
-
|
|
1165
|
+
UNUSED(kmask1);
|
|
1166
|
+
UNUSED(kmask2);
|
|
1167
|
+
UNUSED(x);
|
|
1168
|
+
UNUSED(y);
|
|
1169
|
+
UNUSED(nb);
|
|
1170
|
+
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1304
1171
|
#endif
|
|
1305
|
-
|
|
1306
1172
|
}
|
|
1307
1173
|
|
|
1308
1174
|
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -1391,61 +1257,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1391
1257
|
*s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
|
|
1392
1258
|
|
|
1393
1259
|
#else
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
memset(sums, 0, 8*sizeof(float));
|
|
1403
|
-
|
|
1404
|
-
float sumf = 0;
|
|
1405
|
-
for (int i = 0; i < nb; ++i) {
|
|
1406
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
1407
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1408
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
1409
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
1410
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
1411
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
1412
|
-
a += 32;
|
|
1413
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
1414
|
-
a += 32; q4 += 32;
|
|
1415
|
-
}
|
|
1416
|
-
memcpy(utmp, x[i].scales, 12);
|
|
1417
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
1418
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
1419
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
1420
|
-
utmp[2] = uaux;
|
|
1421
|
-
utmp[0] &= kmask1;
|
|
1422
|
-
|
|
1423
|
-
int sumi = 0;
|
|
1424
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
1425
|
-
a = aux8;
|
|
1426
|
-
int is = 0;
|
|
1427
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
1428
|
-
int32_t scale = scales[is++];
|
|
1429
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1430
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1431
|
-
q8 += 8; a += 8;
|
|
1432
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1433
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1434
|
-
q8 += 8; a += 8;
|
|
1435
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1436
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1437
|
-
q8 += 8; a += 8;
|
|
1438
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1439
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1440
|
-
q8 += 8; a += 8;
|
|
1441
|
-
}
|
|
1442
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1443
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1444
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
1445
|
-
sumf -= dmin * sumi;
|
|
1446
|
-
}
|
|
1447
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
1448
|
-
*s = sumf;
|
|
1260
|
+
UNUSED(x);
|
|
1261
|
+
UNUSED(y);
|
|
1262
|
+
UNUSED(nb);
|
|
1263
|
+
UNUSED(kmask1);
|
|
1264
|
+
UNUSED(kmask2);
|
|
1265
|
+
UNUSED(kmask3);
|
|
1266
|
+
UNUSED(utmp);
|
|
1267
|
+
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1449
1268
|
#endif
|
|
1450
1269
|
}
|
|
1451
1270
|
|
|
@@ -1541,66 +1360,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1541
1360
|
*s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
|
|
1542
1361
|
|
|
1543
1362
|
#else
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
memset(sums, 0, 8*sizeof(float));
|
|
1553
|
-
|
|
1554
|
-
float sumf = 0;
|
|
1555
|
-
for (int i = 0; i < nb; ++i) {
|
|
1556
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
1557
|
-
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
|
1558
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1559
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
1560
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
1561
|
-
uint8_t m = 1;
|
|
1562
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
1563
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
1564
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
1565
|
-
a += 32; m <<= 1;
|
|
1566
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
1567
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
1568
|
-
a += 32; m <<= 1;
|
|
1569
|
-
q4 += 32;
|
|
1570
|
-
}
|
|
1571
|
-
memcpy(utmp, x[i].scales, 12);
|
|
1572
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
1573
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
1574
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
1575
|
-
utmp[2] = uaux;
|
|
1576
|
-
utmp[0] &= kmask1;
|
|
1577
|
-
|
|
1578
|
-
int sumi = 0;
|
|
1579
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
1580
|
-
a = aux8;
|
|
1581
|
-
int is = 0;
|
|
1582
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
1583
|
-
int32_t scale = scales[is++];
|
|
1584
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1585
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1586
|
-
q8 += 8; a += 8;
|
|
1587
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1588
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1589
|
-
q8 += 8; a += 8;
|
|
1590
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1591
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1592
|
-
q8 += 8; a += 8;
|
|
1593
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1594
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1595
|
-
q8 += 8; a += 8;
|
|
1596
|
-
}
|
|
1597
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1598
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1599
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
1600
|
-
sumf -= dmin * sumi;
|
|
1601
|
-
}
|
|
1602
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
1603
|
-
*s = sumf;
|
|
1363
|
+
UNUSED(x);
|
|
1364
|
+
UNUSED(y);
|
|
1365
|
+
UNUSED(nb);
|
|
1366
|
+
UNUSED(kmask1);
|
|
1367
|
+
UNUSED(kmask2);
|
|
1368
|
+
UNUSED(kmask3);
|
|
1369
|
+
UNUSED(utmp);
|
|
1370
|
+
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1604
1371
|
#endif
|
|
1605
1372
|
}
|
|
1606
1373
|
|
|
@@ -1678,47 +1445,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1678
1445
|
*s = hsum_float_8(acc);
|
|
1679
1446
|
|
|
1680
1447
|
#else
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
int32_t aux32[8];
|
|
1686
|
-
memset(sums, 0, 8*sizeof(float));
|
|
1687
|
-
|
|
1688
|
-
float sumf = 0;
|
|
1689
|
-
for (int i = 0; i < nb; ++i) {
|
|
1690
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
1691
|
-
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
1692
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1693
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
1694
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
1695
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
1696
|
-
for (int l = 0; l < 32; ++l) {
|
|
1697
|
-
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
|
1698
|
-
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
|
1699
|
-
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
|
1700
|
-
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
|
1701
|
-
}
|
|
1702
|
-
a += 128;
|
|
1703
|
-
q4 += 64;
|
|
1704
|
-
qh += 32;
|
|
1705
|
-
}
|
|
1706
|
-
a = aux8;
|
|
1707
|
-
int is = 0;
|
|
1708
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
1709
|
-
int scale = x[i].scales[is++];
|
|
1710
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1711
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1712
|
-
q8 += 8; a += 8;
|
|
1713
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1714
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1715
|
-
q8 += 8; a += 8;
|
|
1716
|
-
}
|
|
1717
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1718
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1719
|
-
}
|
|
1720
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
1721
|
-
*s = sumf;
|
|
1448
|
+
UNUSED(x);
|
|
1449
|
+
UNUSED(y);
|
|
1450
|
+
UNUSED(nb);
|
|
1451
|
+
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1722
1452
|
#endif
|
|
1723
1453
|
}
|
|
1724
1454
|
|
|
@@ -1815,34 +1545,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
1815
1545
|
*s = 0.125f * hsum_float_8(accumf);
|
|
1816
1546
|
|
|
1817
1547
|
#else
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
float sumf = 0.f;
|
|
1823
|
-
for (int i = 0; i < nb; ++i) {
|
|
1824
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1825
|
-
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1826
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1827
|
-
int32_t bsum = 0;
|
|
1828
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
1829
|
-
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
|
1830
|
-
q2 += 4;
|
|
1831
|
-
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
|
1832
|
-
int32_t sumi = 0;
|
|
1833
|
-
for (int l = 0; l < 4; ++l) {
|
|
1834
|
-
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
|
1835
|
-
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
|
1836
|
-
for (int j = 0; j < 8; ++j) {
|
|
1837
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
1838
|
-
}
|
|
1839
|
-
q8 += 8;
|
|
1840
|
-
}
|
|
1841
|
-
bsum += sumi * ls;
|
|
1842
|
-
}
|
|
1843
|
-
sumf += d * bsum;
|
|
1844
|
-
}
|
|
1845
|
-
*s = 0.125f * sumf;
|
|
1548
|
+
UNUSED(x);
|
|
1549
|
+
UNUSED(y);
|
|
1550
|
+
UNUSED(nb);
|
|
1551
|
+
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1846
1552
|
#endif
|
|
1847
1553
|
}
|
|
1848
1554
|
|
|
@@ -1978,42 +1684,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1978
1684
|
*s = 0.125f * hsum_float_8(accumf);
|
|
1979
1685
|
|
|
1980
1686
|
#else
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
1985
|
-
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1986
|
-
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
|
1987
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1988
|
-
int32_t bsum = 0;
|
|
1989
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
1990
|
-
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
|
|
1991
|
-
const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
|
|
1992
|
-
int32_t sumi = 0;
|
|
1993
|
-
for (int l = 0; l < 2; ++l) {
|
|
1994
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
1995
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
1996
|
-
for (int j = 0; j < 8; ++j) {
|
|
1997
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
1998
|
-
}
|
|
1999
|
-
q8 += 8;
|
|
2000
|
-
}
|
|
2001
|
-
bsum += sumi * ls1;
|
|
2002
|
-
sumi = 0;
|
|
2003
|
-
for (int l = 2; l < 4; ++l) {
|
|
2004
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
2005
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
2006
|
-
for (int j = 0; j < 8; ++j) {
|
|
2007
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
2008
|
-
}
|
|
2009
|
-
q8 += 8;
|
|
2010
|
-
}
|
|
2011
|
-
bsum += sumi * ls2;
|
|
2012
|
-
q2 += 4;
|
|
2013
|
-
}
|
|
2014
|
-
sumf += d * bsum;
|
|
2015
|
-
}
|
|
2016
|
-
*s = 0.125f * sumf;
|
|
1687
|
+
UNUSED(x);
|
|
1688
|
+
UNUSED(y);
|
|
1689
|
+
UNUSED(nb);
|
|
1690
|
+
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2017
1691
|
#endif
|
|
2018
1692
|
}
|
|
2019
1693
|
|
|
@@ -2105,47 +1779,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2105
1779
|
*s = 0.125f * hsum_float_8(accumf);
|
|
2106
1780
|
|
|
2107
1781
|
#else
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2113
|
-
const int8_t * q8 = y[i].qs;
|
|
2114
|
-
const uint8_t * qs = x[i].qs;
|
|
2115
|
-
const uint8_t * qh = x[i].qh;
|
|
2116
|
-
const uint8_t * signs = qs + QK_K/8;
|
|
2117
|
-
|
|
2118
|
-
int bsum = 0;
|
|
2119
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
2120
|
-
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
|
2121
|
-
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
|
2122
|
-
int sumi1 = 0, sumi2 = 0;
|
|
2123
|
-
for (int l = 0; l < 2; ++l) {
|
|
2124
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
2125
|
-
for (int j = 0; j < 8; ++j) {
|
|
2126
|
-
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
2127
|
-
}
|
|
2128
|
-
q8 += 8;
|
|
2129
|
-
}
|
|
2130
|
-
for (int l = 2; l < 4; ++l) {
|
|
2131
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
2132
|
-
for (int j = 0; j < 8; ++j) {
|
|
2133
|
-
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
2134
|
-
}
|
|
2135
|
-
q8 += 8;
|
|
2136
|
-
}
|
|
2137
|
-
bsum += ls1 * sumi1 + ls2 * sumi2;
|
|
2138
|
-
qs += 4;
|
|
2139
|
-
signs += 4;
|
|
2140
|
-
}
|
|
2141
|
-
|
|
2142
|
-
sumf += d * bsum;
|
|
2143
|
-
}
|
|
2144
|
-
|
|
2145
|
-
*s = 0.125f * sumf;
|
|
2146
|
-
|
|
1782
|
+
UNUSED(x);
|
|
1783
|
+
UNUSED(y);
|
|
1784
|
+
UNUSED(nb);
|
|
1785
|
+
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2147
1786
|
#endif
|
|
2148
|
-
|
|
2149
1787
|
}
|
|
2150
1788
|
|
|
2151
1789
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -2209,36 +1847,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
2209
1847
|
*s = 0.25f * hsum_float_8(accumf);
|
|
2210
1848
|
|
|
2211
1849
|
#else
|
|
2212
|
-
|
|
2213
|
-
|
|
2214
|
-
|
|
2215
|
-
|
|
2216
|
-
for (int i = 0; i < nb; ++i) {
|
|
2217
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2218
|
-
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
2219
|
-
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
2220
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2221
|
-
int32_t bsum = 0;
|
|
2222
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
2223
|
-
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
|
2224
|
-
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
|
2225
|
-
int32_t sumi = 0;
|
|
2226
|
-
for (int l = 0; l < 4; ++l) {
|
|
2227
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
|
2228
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
|
2229
|
-
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
|
2230
|
-
for (int j = 0; j < 4; ++j) {
|
|
2231
|
-
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
|
2232
|
-
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
|
2233
|
-
}
|
|
2234
|
-
q8 += 8;
|
|
2235
|
-
}
|
|
2236
|
-
q3 += 8;
|
|
2237
|
-
bsum += sumi * ls;
|
|
2238
|
-
}
|
|
2239
|
-
sumf += d * bsum;
|
|
2240
|
-
}
|
|
2241
|
-
*s = 0.25f * sumf;
|
|
1850
|
+
UNUSED(x);
|
|
1851
|
+
UNUSED(y);
|
|
1852
|
+
UNUSED(nb);
|
|
1853
|
+
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2242
1854
|
#endif
|
|
2243
1855
|
}
|
|
2244
1856
|
|
|
@@ -2338,48 +1950,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2338
1950
|
*s = hsum_float_8(accumf);
|
|
2339
1951
|
|
|
2340
1952
|
#else
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
2346
|
-
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
2347
|
-
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
|
2348
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2349
|
-
int32_t bsum = 0;
|
|
2350
|
-
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
2351
|
-
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
|
2352
|
-
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
|
2353
|
-
int32_t sumi = 0;
|
|
2354
|
-
for (int l = 0; l < 4; ++l) {
|
|
2355
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
|
2356
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
|
2357
|
-
for (int j = 0; j < 4; ++j) {
|
|
2358
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
2359
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
2360
|
-
}
|
|
2361
|
-
q8 += 8;
|
|
2362
|
-
}
|
|
2363
|
-
qs += 8;
|
|
2364
|
-
signs += 4;
|
|
2365
|
-
bsum += sumi * ls1;
|
|
2366
|
-
sumi = 0;
|
|
2367
|
-
for (int l = 0; l < 4; ++l) {
|
|
2368
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
|
2369
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
|
2370
|
-
for (int j = 0; j < 4; ++j) {
|
|
2371
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
2372
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
2373
|
-
}
|
|
2374
|
-
q8 += 8;
|
|
2375
|
-
}
|
|
2376
|
-
qs += 8;
|
|
2377
|
-
signs += 4;
|
|
2378
|
-
bsum += sumi * ls2;
|
|
2379
|
-
}
|
|
2380
|
-
sumf += d * bsum;
|
|
2381
|
-
}
|
|
2382
|
-
*s = sumf;
|
|
1953
|
+
UNUSED(x);
|
|
1954
|
+
UNUSED(y);
|
|
1955
|
+
UNUSED(nb);
|
|
1956
|
+
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2383
1957
|
#endif
|
|
2384
1958
|
}
|
|
2385
1959
|
|
|
@@ -2460,36 +2034,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2460
2034
|
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
|
2461
2035
|
|
|
2462
2036
|
#else
|
|
2463
|
-
|
|
2464
|
-
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
const int8_t * q8 = y[i].qs;
|
|
2468
|
-
const uint8_t * qs = x[i].qs;
|
|
2469
|
-
const uint16_t * qh = x[i].qh;
|
|
2470
|
-
|
|
2471
|
-
int sumi = 0, sumi1 = 0;
|
|
2472
|
-
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
2473
|
-
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
|
2474
|
-
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
|
2475
|
-
int lsum = 0;
|
|
2476
|
-
for (int l = 0; l < 4; ++l) {
|
|
2477
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
|
2478
|
-
for (int j = 0; j < 8; ++j) {
|
|
2479
|
-
lsum += q8[j] * grid[j];
|
|
2480
|
-
}
|
|
2481
|
-
q8 += 8;
|
|
2482
|
-
}
|
|
2483
|
-
sumi += ls * lsum;
|
|
2484
|
-
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
|
2485
|
-
qs += 4;
|
|
2486
|
-
}
|
|
2487
|
-
|
|
2488
|
-
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
|
2489
|
-
}
|
|
2490
|
-
|
|
2491
|
-
*s = sumf;
|
|
2492
|
-
|
|
2037
|
+
UNUSED(x);
|
|
2038
|
+
UNUSED(y);
|
|
2039
|
+
UNUSED(nb);
|
|
2040
|
+
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2493
2041
|
#endif
|
|
2494
2042
|
}
|
|
2495
2043
|
|
|
@@ -2603,37 +2151,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
2603
2151
|
*s = hsum_float_8(accum);
|
|
2604
2152
|
|
|
2605
2153
|
#else
|
|
2606
|
-
|
|
2607
|
-
|
|
2608
|
-
|
|
2609
|
-
|
|
2610
|
-
const uint8_t * qs = x[ibl].qs;
|
|
2611
|
-
const int8_t * q8 = y[ibl].qs;
|
|
2612
|
-
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
|
2613
|
-
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
|
2614
|
-
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
|
2615
|
-
h >>= 4;
|
|
2616
|
-
const float d1 = d4d8*(ls1 - 32);
|
|
2617
|
-
const float d2 = d4d8*(ls2 - 32);
|
|
2618
|
-
int sumi1 = 0, sumi2 = 0;
|
|
2619
|
-
for (int j = 0; j < 16; ++j) {
|
|
2620
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
2621
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
2622
|
-
}
|
|
2623
|
-
sumf += d1 * (sumi1 + sumi2);
|
|
2624
|
-
qs += 16;
|
|
2625
|
-
q8 += 32;
|
|
2626
|
-
sumi1 = sumi2 = 0;
|
|
2627
|
-
for (int j = 0; j < 16; ++j) {
|
|
2628
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
2629
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
2630
|
-
}
|
|
2631
|
-
sumf += d2 * (sumi1 + sumi2);
|
|
2632
|
-
qs += 16;
|
|
2633
|
-
q8 += 32;
|
|
2634
|
-
}
|
|
2635
|
-
}
|
|
2636
|
-
*s = sumf;
|
|
2154
|
+
UNUSED(x);
|
|
2155
|
+
UNUSED(y);
|
|
2156
|
+
UNUSED(nb);
|
|
2157
|
+
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2637
2158
|
#endif
|
|
2638
2159
|
}
|
|
2639
2160
|
|