@fugood/llama.node 1.1.1 → 1.1.2

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
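Every hunk below applies the same refactor to the x86 quantized dot products: the hand-written scalar fallback loop is deleted, and the `#else` branch instead delegates to a shared `_generic` reference implementation, with `UNUSED(...)` silencing locals that only the SIMD path still reads. As a rough sketch of the resulting shape — `vec_dot_example` and `vec_dot_example_generic` are hypothetical stand-ins, not actual ggml symbols:

    #define UNUSED(x) (void)(x)

    /* Shared scalar reference path, one copy for all architectures. */
    static void vec_dot_example_generic(int n, float * s, const float * x, const float * y) {
        float sumf = 0;
        for (int i = 0; i < n; ++i) {
            sumf += x[i] * y[i];
        }
        *s = sumf;
    }

    /* Per-architecture front end, mirroring the pattern in the hunks below. */
    static void vec_dot_example(int n, float * s, const float * x, const float * y) {
        int ib = 0; /* remainder index; only the SIMD path uses it now */
    #if defined(__AVX2__)
        float acc = 0; /* stands in for the __m256 accumulator */
        for (; ib < n; ++ib) {
            acc += x[ib] * y[ib];
        }
        *s = acc; /* result stored directly; no trailing scalar loop */
    #else
        UNUSED(ib); /* silence unused-variable warnings on non-SIMD builds */
        vec_dot_example_generic(n, s, x, y);
    #endif
    }

On a build without the relevant SIMD flags, the front end compiles down to a single call into the shared reference code — which is what lets each removed scalar loop below disappear, assuming the `_generic` functions carry the same scalar logic that is deleted here.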
@@ -702,7 +702,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const block_q8_1 * GGML_RESTRICT y = vy;
 
     int ib = 0;
-    float sumf = 0;
 
 #if defined(__AVX2__) || defined(__AVX__)
     // Initialize accumulator with zeros
@@ -737,26 +736,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
     }
 
-    sumf = hsum_float_8(acc) + summs;
-
+    *s = hsum_float_8(acc) + summs;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F);
-            const int v1 = (x[ib].qs[j] >> 4);
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
-    *s = sumf;
 }
 
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -764,7 +751,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const int nb = n / qk;
 
     int ib = 0;
-    float sumf = 0;
 
     assert(n % qk == 0);
     assert(qk == QK5_0);
@@ -799,7 +785,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = _mm256_fmadd_ps(d, q, acc);
     }
 
-    sumf = hsum_float_8(acc);
+    *s = hsum_float_8(acc);
 #elif defined(__AVX__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -830,32 +816,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
     }
 
-    sumf = hsum_float_8(acc);
-
+    *s = hsum_float_8(acc);
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
-            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
-    *s = sumf;
 }
 
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -863,7 +831,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const int nb = n / qk;
 
     int ib = 0;
-    float sumf = 0;
 
     assert(n % qk == 0);
    assert(qk == QK5_1);
@@ -901,7 +868,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
     }
 
-    sumf = hsum_float_8(acc) + summs;
+    *s = hsum_float_8(acc) + summs;
 #elif defined(__AVX__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -935,32 +902,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
     }
 
-    sumf = hsum_float_8(acc) + summs;
-
+    *s = hsum_float_8(acc) + summs;
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
-
-            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
-    *s = sumf;
 }
 
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1017,7 +966,6 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     }
 
     sumf = hsum_float_8(accum);
-
 #endif
     for (; ib < nb; ++ib) {
         int sumi = 0;
@@ -1157,44 +1105,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(sumf);
 
 #else
-    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
-
-    float sumf = 0.0f;
-
-    for (int i = 0; i < nb; ++i) {
-        int sum = 0;
-
-        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
-            for (size_t l = 0; l < 5; ++l) {
-                for (size_t m = 0; m < 32; ++m) {
-                    uint8_t q = x[i].qs[j + m] * pow3[l];
-                    uint16_t xi = ((uint16_t) q * 3) >> 8;
-                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
-                }
-            }
-        }
-        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
-            for (size_t l = 0; l < 5; ++l) {
-                for (size_t m = 0; m < 16; ++m) {
-                    uint8_t q = x[i].qs[j + m] * pow3[l];
-                    uint16_t xi = ((uint16_t) q * 3) >> 8;
-                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
-                }
-            }
-        }
-
-        for (size_t l = 0; l < 4; ++l) {
-            for (size_t j = 0; j < sizeof(x->qh); ++j) {
-                uint8_t q = x[i].qh[j] * pow3[l];
-                uint16_t xi = ((uint16_t) q * 3) >> 8;
-                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
-            }
-        }
-
-        sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
-    }
-
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1257,25 +1171,10 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(sumf);
 
 #else
-    float sumf = 0.0f;
-
-    for (int i = 0; i < nb; ++i) {
-        int32_t sumi = 0;
-
-        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
-            for (size_t l = 0; l < 4; ++l) {
-                for (size_t k = 0; k < 32; ++k) {
-                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
-                }
-            }
-        }
-
-        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-
-        sumf += (float) sumi * d;
-    }
-
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1464,45 +1363,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc);
 
 #else
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1769,70 +1633,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc);
 
 #else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2002,61 +1809,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2259,66 +2019,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc) + summs;
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2520,47 +2228,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = hsum_float_8(acc);
 
 #else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l +  0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2712,34 +2383,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.125f * hsum_float_8(accumf);
 
 #else
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, q2, 2*sizeof(uint32_t));
-            q2 += 4;
-            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -3033,42 +2680,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = 0.125f * hsum_float_8(accumf);
 
 #else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
-        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
-            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls2;
-            q2 += 4;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -3250,47 +2865,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = 0.125f * hsum_float_8(accumf);
 
 #else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const int8_t  * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint8_t * signs = qs + QK_K/8;
-
-        int bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
-            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
-            int sumi1 = 0, sumi2 = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += ls1 * sumi1 + ls2 * sumi2;
-            qs += 4;
-            signs += 4;
-        }
-
-        sumf += d * bsum;
-    }
-
-    *s = 0.125f * sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -3410,36 +2989,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.25f * hsum_float_8(accumf);
 
 #else
-
-    uint32_t aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
-        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
-            const uint32_t ls = 2*(aux32 >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
-                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            q3 += 8;
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.25f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -3646,48 +3199,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(accumf);
 
 #else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * GGML_RESTRICT qs = x[i].qs;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const uint8_t * GGML_RESTRICT signs = x[i].signs;
-        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
-            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls2;
-        }
-        sumf += d * bsum;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -3811,36 +3326,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
 
 #else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        int sumi = 0, sumi1 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
-            const int delta = qh[ib] & 0x8000 ? -1 : 1;
-            int lsum = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
-                for (int j = 0; j < 8; ++j) {
-                    lsum += q8[j] * grid[j];
-                }
-                q8 += 8;
-            }
-            sumi  += ls * lsum;
-            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
-            qs += 4;
-        }
-
-        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
-    }
-
-    *s = sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -4043,52 +3532,11 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
 
 #else
-
-    int sum1[2], sum2[2], delta[4];
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const int8_t   * q8 = y[i].qs;
-        const uint8_t  * qs = x[i].qs;
-        const uint8_t  * qh = x[i].qh;
-        const uint16_t * sc = (const uint16_t *)x[i].scales;
-
-        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            delta[0] = qh[0] & 0x08 ? -1 : 1;
-            delta[1] = qh[0] & 0x80 ? -1 : 1;
-            delta[2] = qh[1] & 0x08 ? -1 : 1;
-            delta[3] = qh[1] & 0x80 ? -1 : 1;
-            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
-                int lsum1 = 0, lsum2 = 0;
-                for (int j = 0; j < 8; ++j) {
-                    lsum1 += q8[j] * grid[j];
-                    lsum2 += q8[j];
-                }
-                q8 += 8;
-                sum1[l/2] += lsum1;
-                sum2[l/2] += lsum2*delta[l];
-            }
-
-            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
-            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
-
-            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
-            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
-            qs += 4;
-            qh += 2;
-        }
-
-        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
-    }
-
-    *s = sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(scale);
+    ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -4275,37 +3723,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = hsum_float_8(accum);
 
 #else
-    float sumf = 0;
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
-        uint16_t h = x[ibl].scales_h;
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t  * q8 = y[ibl].qs;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
-            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
-            h >>= 4;
-            const float d1 = d4d8*(ls1 - 32);
-            const float d2 = d4d8*(ls2 - 32);
-            int sumi1 = 0, sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d1 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-            sumi1 = sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d2 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-        }
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }