@fugood/llama.node 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -821,24 +821,15 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
  sumf = hsum_float_8(acc) + summs;
 
- #endif
- for (; ib < nb; ++ib) {
- int sumi0 = 0;
- int sumi1 = 0;
-
- for (int j = 0; j < qk/2; ++j) {
- const int v0 = (x[ib].qs[j] & 0x0F);
- const int v1 = (x[ib].qs[j] >> 4);
-
- sumi0 += (v0 * y[ib].qs[j]);
- sumi1 += (v1 * y[ib].qs[j + qk/2]);
- }
-
- int sumi = sumi0 + sumi1;
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
- }
-
  *s = sumf;
+ #else
+ UNUSED(nb);
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(ib);
+ UNUSED(sumf);
+ ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+ #endif
 }
 
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -883,30 +874,15 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
  sumf = hsum_float_8(acc);
 
- #endif
- for (; ib < nb; ++ib) {
- uint32_t qh;
- memcpy(&qh, x[ib].qh, sizeof(qh));
-
- int sumi0 = 0;
- int sumi1 = 0;
-
- for (int j = 0; j < qk/2; ++j) {
- const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
- const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
- const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
- const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
-
- sumi0 += (x0 * y[ib].qs[j]);
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
- }
-
- int sumi = sumi0 + sumi1;
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
- }
-
  *s = sumf;
+ #else
+ UNUSED(nb);
+ UNUSED(ib);
+ UNUSED(sumf);
+ UNUSED(x);
+ UNUSED(y);
+ ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+ #endif
 }
 
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -954,30 +930,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
  sumf = hsum_float_8(acc) + summs;
 
- #endif
- for (; ib < nb; ++ib) {
- uint32_t qh;
- memcpy(&qh, x[ib].qh, sizeof(qh));
-
- int sumi0 = 0;
- int sumi1 = 0;
-
- for (int j = 0; j < qk/2; ++j) {
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
-
- const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
- const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
-
- sumi0 += (x0 * y[ib].qs[j]);
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
- }
-
- int sumi = sumi0 + sumi1;
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
- }
-
  *s = sumf;
+ #else
+ UNUSED(nb);
+ UNUSED(ib);
+ UNUSED(sumf);
+ UNUSED(x);
+ UNUSED(y);
+ ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+ #endif
 }
 
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1016,18 +977,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
  sumf = hsum_float_8(acc);
 
- #endif
- for (; ib < nb; ++ib) {
- int sumi = 0;
-
- for (int j = 0; j < qk; j++) {
- sumi += x[ib].qs[j]*y[ib].qs[j];
- }
-
- sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
- }
-
  *s = sumf;
+ #else
+ UNUSED(nb);
+ UNUSED(ib);
+ UNUSED(sumf);
+ UNUSED(x);
+ UNUSED(y);
+ ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+ #endif
 }
 
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1103,45 +1061,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  *s = hsum_float_8(acc);
 
  #else
-
- float sumf = 0;
-
- for (int i = 0; i < nb; ++i) {
-
- const uint8_t * q2 = x[i].qs;
- const int8_t * q8 = y[i].qs;
- const uint8_t * sc = x[i].scales;
-
- int summs = 0;
- for (int j = 0; j < 16; ++j) {
- summs += y[i].bsums[j] * (sc[j] >> 4);
- }
-
- const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
- const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
-
- int isum = 0;
- int is = 0;
- int d;
- for (int k = 0; k < QK_K/128; ++k) {
- int shift = 0;
- for (int j = 0; j < 4; ++j) {
- d = sc[is++] & 0xF;
- int isuml = 0;
- for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
- isum += d * isuml;
- d = sc[is++] & 0xF;
- isuml = 0;
- for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
- isum += d * isuml;
- shift += 2;
- q8 += 32;
- }
- q2 += 32;
- }
- sumf += dall * isum - dmin * summs;
- }
- *s = sumf;
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1239,70 +1162,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  *s = hsum_float_8(acc);
 
  #else
- // scalar version
- // This function is written like this so the compiler can manage to vectorize most of it
- // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
- // manually vectorized version above. Every other version I tried would run at least 4 times slower.
- // The ideal situation would be if we could just write the code once, and the compiler would
- // automatically produce the best possible set of machine instructions, instead of us having to manually
- // write vectorized versions for AVX, ARM_NEON, etc.
-
- int8_t aux8[QK_K];
- int16_t aux16[8];
- float sums [8];
- int32_t aux32[8];
- memset(sums, 0, 8*sizeof(float));
-
- uint32_t auxs[4];
- const int8_t * scales = (const int8_t*)auxs;
-
- float sumf = 0;
- for (int i = 0; i < nb; ++i) {
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
- const uint8_t * GGML_RESTRICT hm = x[i].hmask;
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
- memset(aux32, 0, 8*sizeof(int32_t));
- int8_t * GGML_RESTRICT a = aux8;
- uint8_t m = 1;
- for (int j = 0; j < QK_K; j += 128) {
- for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
- a += 32; m <<= 1;
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
- a += 32; m <<= 1;
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
- a += 32; m <<= 1;
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
- a += 32; m <<= 1;
- q3 += 32;
- }
- a = aux8;
-
- memcpy(auxs, x[i].scales, 12);
- uint32_t tmp = auxs[2];
- auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
- auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
- auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
- auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
- for (int j = 0; j < QK_K/16; ++j) {
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
- q8 += 8; a += 8;
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
- q8 += 8; a += 8;
- }
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
- }
- for (int l = 0; l < 8; ++l) sumf += sums[l];
- *s = sumf;
-
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1391,61 +1257,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
 
  #else
-
- const uint8_t * scales = (const uint8_t*)&utmp[0];
- const uint8_t * mins = (const uint8_t*)&utmp[2];
-
- int8_t aux8[QK_K];
- int16_t aux16[8];
- float sums [8];
- int32_t aux32[8];
- memset(sums, 0, 8*sizeof(float));
-
- float sumf = 0;
- for (int i = 0; i < nb; ++i) {
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
- memset(aux32, 0, 8*sizeof(int32_t));
- int8_t * GGML_RESTRICT a = aux8;
- for (int j = 0; j < QK_K/64; ++j) {
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
- a += 32;
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
- a += 32; q4 += 32;
- }
- memcpy(utmp, x[i].scales, 12);
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
- const uint32_t uaux = utmp[1] & kmask1;
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
- utmp[2] = uaux;
- utmp[0] &= kmask1;
-
- int sumi = 0;
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
- a = aux8;
- int is = 0;
- for (int j = 0; j < QK_K/32; ++j) {
- int32_t scale = scales[is++];
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- }
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
- sumf -= dmin * sumi;
- }
- for (int l = 0; l < 8; ++l) sumf += sums[l];
- *s = sumf;
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1541,66 +1360,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
 
  #else
-
- const uint8_t * scales = (const uint8_t*)&utmp[0];
- const uint8_t * mins = (const uint8_t*)&utmp[2];
-
- int8_t aux8[QK_K];
- int16_t aux16[8];
- float sums [8];
- int32_t aux32[8];
- memset(sums, 0, 8*sizeof(float));
-
- float sumf = 0;
- for (int i = 0; i < nb; ++i) {
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
- const uint8_t * GGML_RESTRICT hm = x[i].qh;
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
- memset(aux32, 0, 8*sizeof(int32_t));
- int8_t * GGML_RESTRICT a = aux8;
- uint8_t m = 1;
- for (int j = 0; j < QK_K/64; ++j) {
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
- a += 32; m <<= 1;
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
- a += 32; m <<= 1;
- q4 += 32;
- }
- memcpy(utmp, x[i].scales, 12);
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
- const uint32_t uaux = utmp[1] & kmask1;
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
- utmp[2] = uaux;
- utmp[0] &= kmask1;
-
- int sumi = 0;
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
- a = aux8;
- int is = 0;
- for (int j = 0; j < QK_K/32; ++j) {
- int32_t scale = scales[is++];
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- }
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
- sumf -= dmin * sumi;
- }
- for (int l = 0; l < 8; ++l) sumf += sums[l];
- *s = sumf;
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1678,47 +1445,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  *s = hsum_float_8(acc);
 
  #else
-
- int8_t aux8[QK_K];
- int16_t aux16[8];
- float sums [8];
- int32_t aux32[8];
- memset(sums, 0, 8*sizeof(float));
-
- float sumf = 0;
- for (int i = 0; i < nb; ++i) {
- const uint8_t * GGML_RESTRICT q4 = x[i].ql;
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
- memset(aux32, 0, 8*sizeof(int32_t));
- int8_t * GGML_RESTRICT a = aux8;
- for (int j = 0; j < QK_K; j += 128) {
- for (int l = 0; l < 32; ++l) {
- a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
- a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
- a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
- a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
- }
- a += 128;
- q4 += 64;
- qh += 32;
- }
- a = aux8;
- int is = 0;
- for (int j = 0; j < QK_K/16; ++j) {
- int scale = x[i].scales[is++];
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
- q8 += 8; a += 8;
- }
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
- }
- for (int l = 0; l < 8; ++l) sumf += sums[l];
- *s = sumf;
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1815,34 +1545,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
  *s = 0.125f * hsum_float_8(accumf);
 
  #else
-
- uint32_t aux32[2];
- const uint8_t * aux8 = (const uint8_t *)aux32;
-
- float sumf = 0.f;
- for (int i = 0; i < nb; ++i) {
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * GGML_RESTRICT q2 = x[i].qs;
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
- int32_t bsum = 0;
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
- memcpy(aux32, q2, 2*sizeof(uint32_t));
- q2 += 4;
- const uint32_t ls = 2*(aux32[1] >> 28) + 1;
- int32_t sumi = 0;
- for (int l = 0; l < 4; ++l) {
- const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
- const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
- for (int j = 0; j < 8; ++j) {
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
- }
- q8 += 8;
- }
- bsum += sumi * ls;
- }
- sumf += d * bsum;
- }
- *s = 0.125f * sumf;
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1978,42 +1684,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
  *s = 0.125f * hsum_float_8(accumf);
 
  #else
-
- float sumf = 0.f;
- for (int i = 0; i < nb; ++i) {
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint16_t * GGML_RESTRICT q2 = x[i].qs;
- const uint8_t * GGML_RESTRICT sc = x[i].scales;
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
- int32_t bsum = 0;
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
- const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
- const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
- int32_t sumi = 0;
- for (int l = 0; l < 2; ++l) {
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
- for (int j = 0; j < 8; ++j) {
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
- }
- q8 += 8;
- }
- bsum += sumi * ls1;
- sumi = 0;
- for (int l = 2; l < 4; ++l) {
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
- for (int j = 0; j < 8; ++j) {
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
- }
- q8 += 8;
- }
- bsum += sumi * ls2;
- q2 += 4;
- }
- sumf += d * bsum;
- }
- *s = 0.125f * sumf;
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2105,47 +1779,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  *s = 0.125f * hsum_float_8(accumf);
 
  #else
-
- float sumf = 0;
- for (int i = 0; i < nb; i++) {
-
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
- const int8_t * q8 = y[i].qs;
- const uint8_t * qs = x[i].qs;
- const uint8_t * qh = x[i].qh;
- const uint8_t * signs = qs + QK_K/8;
-
- int bsum = 0;
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
- int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
- int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
- int sumi1 = 0, sumi2 = 0;
- for (int l = 0; l < 2; ++l) {
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
- for (int j = 0; j < 8; ++j) {
- sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
- }
- q8 += 8;
- }
- for (int l = 2; l < 4; ++l) {
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
- for (int j = 0; j < 8; ++j) {
- sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
- }
- q8 += 8;
- }
- bsum += ls1 * sumi1 + ls2 * sumi2;
- qs += 4;
- signs += 4;
- }
-
- sumf += d * bsum;
- }
-
- *s = 0.125f * sumf;
-
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2209,36 +1847,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
  *s = 0.25f * hsum_float_8(accumf);
 
  #else
-
- uint32_t aux32;
-
- float sumf = 0.f;
- for (int i = 0; i < nb; ++i) {
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
- const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
- int32_t bsum = 0;
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
- memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
- const uint32_t ls = 2*(aux32 >> 28) + 1;
- int32_t sumi = 0;
- for (int l = 0; l < 4; ++l) {
- const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
- const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
- const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
- for (int j = 0; j < 4; ++j) {
- sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
- sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
- }
- q8 += 8;
- }
- q3 += 8;
- bsum += sumi * ls;
- }
- sumf += d * bsum;
- }
- *s = 0.25f * sumf;
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2338,48 +1950,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  *s = hsum_float_8(accumf);
 
  #else
-
- float sumf = 0.f;
- for (int i = 0; i < nb; ++i) {
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
- const uint8_t * GGML_RESTRICT qs = x[i].qs;
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
- const uint8_t * GGML_RESTRICT signs = x[i].signs;
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
- int32_t bsum = 0;
- for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
- const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
- const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
- int32_t sumi = 0;
- for (int l = 0; l < 4; ++l) {
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
- for (int j = 0; j < 4; ++j) {
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
- }
- q8 += 8;
- }
- qs += 8;
- signs += 4;
- bsum += sumi * ls1;
- sumi = 0;
- for (int l = 0; l < 4; ++l) {
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
- for (int j = 0; j < 4; ++j) {
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
- }
- q8 += 8;
- }
- qs += 8;
- signs += 4;
- bsum += sumi * ls2;
- }
- sumf += d * bsum;
- }
- *s = sumf;
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2460,36 +2034,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
 
  #else
-
- float sumf = 0;
- for (int i = 0; i < nb; i++) {
-
- const int8_t * q8 = y[i].qs;
- const uint8_t * qs = x[i].qs;
- const uint16_t * qh = x[i].qh;
-
- int sumi = 0, sumi1 = 0;
- for (int ib = 0; ib < QK_K/32; ++ib) {
- const int ls = 2*((qh[ib] >> 12) & 7) + 1;
- const int delta = qh[ib] & 0x8000 ? -1 : 1;
- int lsum = 0;
- for (int l = 0; l < 4; ++l) {
- const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
- for (int j = 0; j < 8; ++j) {
- lsum += q8[j] * grid[j];
- }
- q8 += 8;
- }
- sumi += ls * lsum;
- sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
- qs += 4;
- }
-
- sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
- }
-
- *s = sumf;
-
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2603,37 +2151,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
  *s = hsum_float_8(accum);
 
  #else
- float sumf = 0;
- for (int ibl = 0; ibl < nb; ++ibl) {
- const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
- uint16_t h = x[ibl].scales_h;
- const uint8_t * qs = x[ibl].qs;
- const int8_t * q8 = y[ibl].qs;
- for (int ib = 0; ib < QK_K/32; ib += 2) {
- const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
- const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
- h >>= 4;
- const float d1 = d4d8*(ls1 - 32);
- const float d2 = d4d8*(ls2 - 32);
- int sumi1 = 0, sumi2 = 0;
- for (int j = 0; j < 16; ++j) {
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
- }
- sumf += d1 * (sumi1 + sumi2);
- qs += 16;
- q8 += 32;
- sumi1 = sumi2 = 0;
- for (int j = 0; j < 16; ++j) {
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
- }
- sumf += d2 * (sumi1 + sumi2);
- qs += 16;
- q8 += 32;
- }
- }
- *s = sumf;
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }