whisper.rn 0.5.0-rc.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/cpp/ggml-alloc.c +1 -15
  2. package/cpp/ggml-backend-reg.cpp +17 -8
  3. package/cpp/ggml-backend.cpp +15 -22
  4. package/cpp/ggml-common.h +17 -0
  5. package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
  6. package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
  7. package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
  8. package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  9. package/cpp/ggml-cpu/arch-fallback.h +34 -0
  10. package/cpp/ggml-cpu/ggml-cpu.c +22 -1
  11. package/cpp/ggml-cpu/ggml-cpu.cpp +21 -24
  12. package/cpp/ggml-cpu/ops.cpp +870 -211
  13. package/cpp/ggml-cpu/ops.h +3 -8
  14. package/cpp/ggml-cpu/quants.c +35 -0
  15. package/cpp/ggml-cpu/quants.h +8 -0
  16. package/cpp/ggml-cpu/repack.cpp +458 -47
  17. package/cpp/ggml-cpu/repack.h +22 -0
  18. package/cpp/ggml-cpu/simd-mappings.h +1 -1
  19. package/cpp/ggml-cpu/traits.cpp +2 -2
  20. package/cpp/ggml-cpu/traits.h +1 -1
  21. package/cpp/ggml-cpu/vec.cpp +12 -9
  22. package/cpp/ggml-cpu/vec.h +107 -13
  23. package/cpp/ggml-impl.h +77 -0
  24. package/cpp/ggml-metal-impl.h +51 -12
  25. package/cpp/ggml-metal.m +610 -115
  26. package/cpp/ggml-opt.cpp +97 -41
  27. package/cpp/ggml-opt.h +25 -6
  28. package/cpp/ggml-quants.c +110 -16
  29. package/cpp/ggml-quants.h +6 -0
  30. package/cpp/ggml-whisper-sim.metallib +0 -0
  31. package/cpp/ggml-whisper.metallib +0 -0
  32. package/cpp/ggml.c +314 -88
  33. package/cpp/ggml.h +137 -11
  34. package/cpp/gguf.cpp +8 -1
  35. package/cpp/jsi/RNWhisperJSI.cpp +23 -6
  36. package/cpp/whisper.cpp +15 -6
  37. package/ios/RNWhisper.mm +6 -6
  38. package/ios/RNWhisperContext.mm +2 -0
  39. package/ios/RNWhisperVadContext.mm +2 -0
  40. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  41. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  42. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  43. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  44. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  45. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  48. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  49. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  50. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  51. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  52. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  53. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
  54. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  56. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  57. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  58. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  59. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  60. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  61. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
  62. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  63. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  64. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  65. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  66. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  67. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  68. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
  70. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  71. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  72. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +28 -2
  73. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  74. package/lib/module/realtime-transcription/RealtimeTranscriber.js +28 -2
  75. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  76. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +1 -0
  77. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
  78. package/lib/typescript/realtime-transcription/types.d.ts +6 -0
  79. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
  80. package/package.json +1 -1
  81. package/src/realtime-transcription/RealtimeTranscriber.ts +32 -0
  82. package/src/realtime-transcription/types.ts +6 -0
@@ -589,6 +589,67 @@ void wsp_ggml_vec_dot_q4_1_q8_1(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
589
589
  *s = sumf;
590
590
  }
591
591
 
592
+ void wsp_ggml_vec_dot_mxfp4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
593
+ assert(nrc == 1);
594
+ UNUSED(nrc);
595
+ UNUSED(bx);
596
+ UNUSED(by);
597
+ UNUSED(bs);
598
+ assert(n % QK_MXFP4 == 0);
599
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
600
+
601
+ const block_mxfp4 * WSP_GGML_RESTRICT x = vx;
602
+ const block_q8_0 * WSP_GGML_RESTRICT y = vy;
603
+
604
+ const int nb = n / QK_MXFP4;
605
+
606
+ int ib = 0;
607
+ float sumf = 0;
608
+
609
+ #if defined __ARM_NEON
610
+ const int8x16_t values = vld1q_s8(kvalues_mxfp4);
611
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
612
+ uint8x16x2_t q4bits;
613
+ int8x16x4_t q4b;
614
+ int8x16x4_t q8b;
615
+ int32x4_t prod_1;
616
+ int32x4_t prod_2;
617
+
618
+ for (; ib + 1 < nb; ib += 2) {
619
+ q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
620
+ q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
621
+ q8b.val[0] = vld1q_s8(y[ib + 0].qs);
622
+ q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
623
+ q8b.val[2] = vld1q_s8(y[ib + 1].qs);
624
+ q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);
625
+
626
+ q4b.val[0] = wsp_ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
627
+ q4b.val[1] = wsp_ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
628
+ q4b.val[2] = wsp_ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
629
+ q4b.val[3] = wsp_ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
630
+
631
+ prod_1 = wsp_ggml_vdotq_s32(wsp_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
632
+ prod_2 = wsp_ggml_vdotq_s32(wsp_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
633
+
634
+ sumf +=
635
+ WSP_GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * WSP_GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
636
+ WSP_GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * WSP_GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
637
+ }
638
+
639
+ #endif
640
+ for (; ib < nb; ++ib) {
641
+ const float d = WSP_GGML_CPU_FP16_TO_FP32(y[ib].d)*WSP_GGML_E8M0_TO_FP32_HALF(x[ib].e);
642
+ int sumi1 = 0;
643
+ int sumi2 = 0;
644
+ for (int j = 0; j < QK_MXFP4/2; ++j) {
645
+ sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
646
+ sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
647
+ }
648
+ sumf += d * (sumi1 + sumi2);
649
+ }
650
+ *s = sumf;
651
+ }
652
+
592
653
  void wsp_ggml_vec_dot_q5_0_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
593
654
  const int qk = QK8_0;
594
655
  const int nb = n / qk;
@@ -1236,44 +1297,10 @@ void wsp_ggml_vec_dot_tq1_0_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
1236
1297
  *s = sumf;
1237
1298
 
1238
1299
  #else
1239
- const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
1240
-
1241
- float sumf = 0.0f;
1242
-
1243
- for (int i = 0; i < nb; ++i) {
1244
- int sum = 0;
1245
-
1246
- for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
1247
- for (size_t l = 0; l < 5; ++l) {
1248
- for (size_t m = 0; m < 32; ++m) {
1249
- uint8_t q = x[i].qs[j + m] * pow3[l];
1250
- uint16_t xi = ((uint16_t) q * 3) >> 8;
1251
- sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
1252
- }
1253
- }
1254
- }
1255
- for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
1256
- for (size_t l = 0; l < 5; ++l) {
1257
- for (size_t m = 0; m < 16; ++m) {
1258
- uint8_t q = x[i].qs[j + m] * pow3[l];
1259
- uint16_t xi = ((uint16_t) q * 3) >> 8;
1260
- sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
1261
- }
1262
- }
1263
- }
1264
-
1265
- for (size_t l = 0; l < 4; ++l) {
1266
- for (size_t j = 0; j < sizeof(x->qh); ++j) {
1267
- uint8_t q = x[i].qh[j] * pow3[l];
1268
- uint16_t xi = ((uint16_t) q * 3) >> 8;
1269
- sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
1270
- }
1271
- }
1272
-
1273
- sumf += (float) sum * (WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
1274
- }
1275
-
1276
- *s = sumf;
1300
+ UNUSED(x);
1301
+ UNUSED(y);
1302
+ UNUSED(nb);
1303
+ wsp_ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1277
1304
  #endif
1278
1305
  }
1279
1306
 
@@ -1381,25 +1408,10 @@ void wsp_ggml_vec_dot_tq2_0_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
1381
1408
  *s = sumf;
1382
1409
 
1383
1410
  #else
1384
- float sumf = 0.0f;
1385
-
1386
- for (int i = 0; i < nb; ++i) {
1387
- int32_t sumi = 0;
1388
-
1389
- for (size_t j = 0; j < sizeof(x->qs); j += 32) {
1390
- for (size_t l = 0; l < 4; ++l) {
1391
- for (size_t k = 0; k < 32; ++k) {
1392
- sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
1393
- }
1394
- }
1395
- }
1396
-
1397
- const float d = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].d);
1398
-
1399
- sumf += (float) sumi * d;
1400
- }
1401
-
1402
- *s = sumf;
1411
+ UNUSED(x);
1412
+ UNUSED(y);
1413
+ UNUSED(nb);
1414
+ wsp_ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1403
1415
  #endif
1404
1416
  }
1405
1417
 
@@ -1729,45 +1741,10 @@ void wsp_ggml_vec_dot_q2_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
1729
1741
  *s = sum;
1730
1742
 
1731
1743
  #else
1732
-
1733
- float sumf = 0;
1734
-
1735
- for (int i = 0; i < nb; ++i) {
1736
-
1737
- const uint8_t * q2 = x[i].qs;
1738
- const int8_t * q8 = y[i].qs;
1739
- const uint8_t * sc = x[i].scales;
1740
-
1741
- int summs = 0;
1742
- for (int j = 0; j < 16; ++j) {
1743
- summs += y[i].bsums[j] * (sc[j] >> 4);
1744
- }
1745
-
1746
- const float dall = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].d);
1747
- const float dmin = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin);
1748
-
1749
- int isum = 0;
1750
- int is = 0;
1751
- int d;
1752
- for (int k = 0; k < QK_K/128; ++k) {
1753
- int shift = 0;
1754
- for (int j = 0; j < 4; ++j) {
1755
- d = sc[is++] & 0xF;
1756
- int isuml = 0;
1757
- for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1758
- isum += d * isuml;
1759
- d = sc[is++] & 0xF;
1760
- isuml = 0;
1761
- for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1762
- isum += d * isuml;
1763
- shift += 2;
1764
- q8 += 32;
1765
- }
1766
- q2 += 32;
1767
- }
1768
- sumf += dall * isum - dmin * summs;
1769
- }
1770
- *s = sumf;
1744
+ UNUSED(x);
1745
+ UNUSED(y);
1746
+ UNUSED(nb);
1747
+ wsp_ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1771
1748
  #endif
1772
1749
  }
1773
1750
 
@@ -2057,68 +2034,12 @@ void wsp_ggml_vec_dot_q3_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
2057
2034
  *s = sum;
2058
2035
 
2059
2036
  #else
2060
- // scalar version
2061
- // This function is written like this so the compiler can manage to vectorize most of it
2062
- // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
2063
- // manually vectorized version above. Every other version I tried would run at least 4 times slower.
2064
- // The ideal situation would be if we could just write the code once, and the compiler would
2065
- // automatically produce the best possible set of machine instructions, instead of us having to manually
2066
- // write vectorized versions for AVX, ARM_NEON, etc.
2067
-
2068
- int8_t aux8[QK_K];
2069
- int16_t aux16[8];
2070
- float sums [8];
2071
- int32_t aux32[8];
2072
- memset(sums, 0, 8*sizeof(float));
2073
-
2074
- uint32_t auxs[4];
2075
- const int8_t * scales = (const int8_t*)auxs;
2076
-
2077
- float sumf = 0;
2078
- for (int i = 0; i < nb; ++i) {
2079
- const uint8_t * WSP_GGML_RESTRICT q3 = x[i].qs;
2080
- const uint8_t * WSP_GGML_RESTRICT hm = x[i].hmask;
2081
- const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
2082
- memset(aux32, 0, 8*sizeof(int32_t));
2083
- int8_t * WSP_GGML_RESTRICT a = aux8;
2084
- uint8_t m = 1;
2085
- for (int j = 0; j < QK_K; j += 128) {
2086
- for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
2087
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
2088
- a += 32; m <<= 1;
2089
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
2090
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
2091
- a += 32; m <<= 1;
2092
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
2093
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
2094
- a += 32; m <<= 1;
2095
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
2096
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
2097
- a += 32; m <<= 1;
2098
- q3 += 32;
2099
- }
2100
- a = aux8;
2101
-
2102
- memcpy(auxs, x[i].scales, 12);
2103
- uint32_t tmp = auxs[2];
2104
- auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
2105
- auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
2106
- auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
2107
- auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
2108
- for (int j = 0; j < QK_K/16; ++j) {
2109
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2110
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
2111
- q8 += 8; a += 8;
2112
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2113
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
2114
- q8 += 8; a += 8;
2115
- }
2116
- const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2117
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2118
- }
2119
- for (int l = 0; l < 8; ++l) sumf += sums[l];
2120
- *s = sumf;
2121
-
2037
+ UNUSED(kmask1);
2038
+ UNUSED(kmask2);
2039
+ UNUSED(x);
2040
+ UNUSED(y);
2041
+ UNUSED(nb);
2042
+ wsp_ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2122
2043
  #endif
2123
2044
 
2124
2045
  }
@@ -2431,61 +2352,14 @@ void wsp_ggml_vec_dot_q4_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
2431
2352
  *s = sumf;
2432
2353
 
2433
2354
  #else
2434
-
2435
- const uint8_t * scales = (const uint8_t*)&utmp[0];
2436
- const uint8_t * mins = (const uint8_t*)&utmp[2];
2437
-
2438
- int8_t aux8[QK_K];
2439
- int16_t aux16[8];
2440
- float sums [8];
2441
- int32_t aux32[8];
2442
- memset(sums, 0, 8*sizeof(float));
2443
-
2444
- float sumf = 0;
2445
- for (int i = 0; i < nb; ++i) {
2446
- const uint8_t * WSP_GGML_RESTRICT q4 = x[i].qs;
2447
- const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
2448
- memset(aux32, 0, 8*sizeof(int32_t));
2449
- int8_t * WSP_GGML_RESTRICT a = aux8;
2450
- for (int j = 0; j < QK_K/64; ++j) {
2451
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
2452
- a += 32;
2453
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
2454
- a += 32; q4 += 32;
2455
- }
2456
- memcpy(utmp, x[i].scales, 12);
2457
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2458
- const uint32_t uaux = utmp[1] & kmask1;
2459
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2460
- utmp[2] = uaux;
2461
- utmp[0] &= kmask1;
2462
-
2463
- int sumi = 0;
2464
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
2465
- a = aux8;
2466
- int is = 0;
2467
- for (int j = 0; j < QK_K/32; ++j) {
2468
- int32_t scale = scales[is++];
2469
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2470
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2471
- q8 += 8; a += 8;
2472
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2473
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2474
- q8 += 8; a += 8;
2475
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2476
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2477
- q8 += 8; a += 8;
2478
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2479
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2480
- q8 += 8; a += 8;
2481
- }
2482
- const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2483
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2484
- const float dmin = WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
2485
- sumf -= dmin * sumi;
2486
- }
2487
- for (int l = 0; l < 8; ++l) sumf += sums[l];
2488
- *s = sumf;
2355
+ UNUSED(x);
2356
+ UNUSED(y);
2357
+ UNUSED(nb);
2358
+ UNUSED(kmask1);
2359
+ UNUSED(kmask2);
2360
+ UNUSED(kmask3);
2361
+ UNUSED(utmp);
2362
+ wsp_ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2489
2363
  #endif
2490
2364
  }
2491
2365
 
@@ -2578,66 +2452,14 @@ void wsp_ggml_vec_dot_q5_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
2578
2452
  *s = sumf;
2579
2453
 
2580
2454
  #else
2581
-
2582
- const uint8_t * scales = (const uint8_t*)&utmp[0];
2583
- const uint8_t * mins = (const uint8_t*)&utmp[2];
2584
-
2585
- int8_t aux8[QK_K];
2586
- int16_t aux16[8];
2587
- float sums [8];
2588
- int32_t aux32[8];
2589
- memset(sums, 0, 8*sizeof(float));
2590
-
2591
- float sumf = 0;
2592
- for (int i = 0; i < nb; ++i) {
2593
- const uint8_t * WSP_GGML_RESTRICT q4 = x[i].qs;
2594
- const uint8_t * WSP_GGML_RESTRICT hm = x[i].qh;
2595
- const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
2596
- memset(aux32, 0, 8*sizeof(int32_t));
2597
- int8_t * WSP_GGML_RESTRICT a = aux8;
2598
- uint8_t m = 1;
2599
- for (int j = 0; j < QK_K/64; ++j) {
2600
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
2601
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
2602
- a += 32; m <<= 1;
2603
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
2604
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
2605
- a += 32; m <<= 1;
2606
- q4 += 32;
2607
- }
2608
- memcpy(utmp, x[i].scales, 12);
2609
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2610
- const uint32_t uaux = utmp[1] & kmask1;
2611
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2612
- utmp[2] = uaux;
2613
- utmp[0] &= kmask1;
2614
-
2615
- int sumi = 0;
2616
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
2617
- a = aux8;
2618
- int is = 0;
2619
- for (int j = 0; j < QK_K/32; ++j) {
2620
- int32_t scale = scales[is++];
2621
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2622
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2623
- q8 += 8; a += 8;
2624
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2625
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2626
- q8 += 8; a += 8;
2627
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2628
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2629
- q8 += 8; a += 8;
2630
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2631
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2632
- q8 += 8; a += 8;
2633
- }
2634
- const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2635
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2636
- const float dmin = WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
2637
- sumf -= dmin * sumi;
2638
- }
2639
- for (int l = 0; l < 8; ++l) sumf += sums[l];
2640
- *s = sumf;
2455
+ UNUSED(x);
2456
+ UNUSED(y);
2457
+ UNUSED(nb);
2458
+ UNUSED(kmask1);
2459
+ UNUSED(kmask2);
2460
+ UNUSED(kmask3);
2461
+ UNUSED(utmp);
2462
+ wsp_ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2641
2463
  #endif
2642
2464
  }
2643
2465
 
@@ -3093,47 +2915,10 @@ void wsp_ggml_vec_dot_q6_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
3093
2915
  }
3094
2916
  *s = sum;
3095
2917
  #else
3096
-
3097
- int8_t aux8[QK_K];
3098
- int16_t aux16[8];
3099
- float sums [8];
3100
- int32_t aux32[8];
3101
- memset(sums, 0, 8*sizeof(float));
3102
-
3103
- float sumf = 0;
3104
- for (int i = 0; i < nb; ++i) {
3105
- const uint8_t * WSP_GGML_RESTRICT q4 = x[i].ql;
3106
- const uint8_t * WSP_GGML_RESTRICT qh = x[i].qh;
3107
- const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
3108
- memset(aux32, 0, 8*sizeof(int32_t));
3109
- int8_t * WSP_GGML_RESTRICT a = aux8;
3110
- for (int j = 0; j < QK_K; j += 128) {
3111
- for (int l = 0; l < 32; ++l) {
3112
- a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
3113
- a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
3114
- a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
3115
- a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
3116
- }
3117
- a += 128;
3118
- q4 += 64;
3119
- qh += 32;
3120
- }
3121
- a = aux8;
3122
- int is = 0;
3123
- for (int j = 0; j < QK_K/16; ++j) {
3124
- int scale = x[i].scales[is++];
3125
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
3126
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
3127
- q8 += 8; a += 8;
3128
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
3129
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
3130
- q8 += 8; a += 8;
3131
- }
3132
- const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3133
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
3134
- }
3135
- for (int l = 0; l < 8; ++l) sumf += sums[l];
3136
- *s = sumf;
2918
+ UNUSED(x);
2919
+ UNUSED(y);
2920
+ UNUSED(nb);
2921
+ wsp_ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3137
2922
  #endif
3138
2923
  }
3139
2924
 
@@ -3229,34 +3014,10 @@ void wsp_ggml_vec_dot_iq2_xxs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs
3229
3014
  *s = 0.25f * sumf;
3230
3015
 
3231
3016
  #else
3232
-
3233
- uint32_t aux32[2];
3234
- const uint8_t * aux8 = (const uint8_t *)aux32;
3235
-
3236
- float sumf = 0.f;
3237
- for (int i = 0; i < nb; ++i) {
3238
- const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3239
- const uint16_t * WSP_GGML_RESTRICT q2 = x[i].qs;
3240
- const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
3241
- int32_t bsum = 0;
3242
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3243
- memcpy(aux32, q2, 2*sizeof(uint32_t));
3244
- q2 += 4;
3245
- const uint32_t ls = 2*(aux32[1] >> 28) + 1;
3246
- int32_t sumi = 0;
3247
- for (int l = 0; l < 4; ++l) {
3248
- const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
3249
- const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
3250
- for (int j = 0; j < 8; ++j) {
3251
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
3252
- }
3253
- q8 += 8;
3254
- }
3255
- bsum += sumi * ls;
3256
- }
3257
- sumf += d * bsum;
3258
- }
3259
- *s = 0.125f * sumf;
3017
+ UNUSED(x);
3018
+ UNUSED(y);
3019
+ UNUSED(nb);
3020
+ wsp_ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3260
3021
  #endif
3261
3022
  }
3262
3023
 
@@ -3327,42 +3088,10 @@ void wsp_ggml_vec_dot_iq2_xs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
3327
3088
  *s = 0.125f * sumf;
3328
3089
 
3329
3090
  #else
3330
-
3331
- float sumf = 0.f;
3332
- for (int i = 0; i < nb; ++i) {
3333
- const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3334
- const uint16_t * WSP_GGML_RESTRICT q2 = x[i].qs;
3335
- const uint8_t * WSP_GGML_RESTRICT sc = x[i].scales;
3336
- const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
3337
- int32_t bsum = 0;
3338
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3339
- const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
3340
- const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
3341
- int32_t sumi = 0;
3342
- for (int l = 0; l < 2; ++l) {
3343
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
3344
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
3345
- for (int j = 0; j < 8; ++j) {
3346
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
3347
- }
3348
- q8 += 8;
3349
- }
3350
- bsum += sumi * ls1;
3351
- sumi = 0;
3352
- for (int l = 2; l < 4; ++l) {
3353
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
3354
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
3355
- for (int j = 0; j < 8; ++j) {
3356
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
3357
- }
3358
- q8 += 8;
3359
- }
3360
- bsum += sumi * ls2;
3361
- q2 += 4;
3362
- }
3363
- sumf += d * bsum;
3364
- }
3365
- *s = 0.125f * sumf;
3091
+ UNUSED(x);
3092
+ UNUSED(y);
3093
+ UNUSED(nb);
3094
+ wsp_ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3366
3095
  #endif
3367
3096
  }
3368
3097
 
@@ -3455,45 +3184,10 @@ void wsp_ggml_vec_dot_iq2_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
3455
3184
  *s = 0.125f * sumf;
3456
3185
 
3457
3186
  #else
3458
-
3459
- float sumf = 0;
3460
- for (int i = 0; i < nb; i++) {
3461
-
3462
- const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3463
- const int8_t * q8 = y[i].qs;
3464
- const uint8_t * qs = x[i].qs;
3465
- const uint8_t * qh = x[i].qh;
3466
- const uint8_t * signs = qs + QK_K/8;
3467
-
3468
- int bsum = 0;
3469
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3470
- int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
3471
- int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
3472
- int sumi1 = 0, sumi2 = 0;
3473
- for (int l = 0; l < 2; ++l) {
3474
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
3475
- for (int j = 0; j < 8; ++j) {
3476
- sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
3477
- }
3478
- q8 += 8;
3479
- }
3480
- for (int l = 2; l < 4; ++l) {
3481
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
3482
- for (int j = 0; j < 8; ++j) {
3483
- sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
3484
- }
3485
- q8 += 8;
3486
- }
3487
- bsum += ls1 * sumi1 + ls2 * sumi2;
3488
- qs += 4;
3489
- signs += 4;
3490
- }
3491
-
3492
- sumf += d * bsum;
3493
- }
3494
-
3495
- *s = 0.125f * sumf;
3496
-
3187
+ UNUSED(x);
3188
+ UNUSED(y);
3189
+ UNUSED(nb);
3190
+ wsp_ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3497
3191
  #endif
3498
3192
 
3499
3193
  }
@@ -3553,36 +3247,10 @@ void wsp_ggml_vec_dot_iq3_xxs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs
3553
3247
  *s = 0.5f * sumf;
3554
3248
 
3555
3249
  #else
3556
-
3557
- uint32_t aux32;
3558
-
3559
- float sumf = 0.f;
3560
- for (int i = 0; i < nb; ++i) {
3561
- const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3562
- const uint8_t * WSP_GGML_RESTRICT q3 = x[i].qs;
3563
- const uint8_t * WSP_GGML_RESTRICT gas = x[i].qs + QK_K/4;
3564
- const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
3565
- int32_t bsum = 0;
3566
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3567
- memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
3568
- const uint32_t ls = 2*(aux32 >> 28) + 1;
3569
- int32_t sumi = 0;
3570
- for (int l = 0; l < 4; ++l) {
3571
- const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
3572
- const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
3573
- const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
3574
- for (int j = 0; j < 4; ++j) {
3575
- sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
3576
- sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
3577
- }
3578
- q8 += 8;
3579
- }
3580
- q3 += 8;
3581
- bsum += sumi * ls;
3582
- }
3583
- sumf += d * bsum;
3584
- }
3585
- *s = 0.25f * sumf;
3250
+ UNUSED(x);
3251
+ UNUSED(y);
3252
+ UNUSED(nb);
3253
+ wsp_ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3586
3254
  #endif
3587
3255
  }
3588
3256
 
@@ -3689,48 +3357,10 @@ void wsp_ggml_vec_dot_iq3_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
3689
3357
  *s = sumf;
3690
3358
 
3691
3359
  #else
3692
-
3693
- float sumf = 0.f;
3694
- for (int i = 0; i < nb; ++i) {
3695
- const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3696
- const uint8_t * WSP_GGML_RESTRICT qs = x[i].qs;
3697
- const uint8_t * WSP_GGML_RESTRICT qh = x[i].qh;
3698
- const uint8_t * WSP_GGML_RESTRICT signs = x[i].signs;
3699
- const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
3700
- int32_t bsum = 0;
3701
- for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
3702
- const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
3703
- const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
3704
- int32_t sumi = 0;
3705
- for (int l = 0; l < 4; ++l) {
3706
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
3707
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
3708
- for (int j = 0; j < 4; ++j) {
3709
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
3710
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
3711
- }
3712
- q8 += 8;
3713
- }
3714
- qs += 8;
3715
- signs += 4;
3716
- bsum += sumi * ls1;
3717
- sumi = 0;
3718
- for (int l = 0; l < 4; ++l) {
3719
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
3720
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
3721
- for (int j = 0; j < 4; ++j) {
3722
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
3723
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
3724
- }
3725
- q8 += 8;
3726
- }
3727
- qs += 8;
3728
- signs += 4;
3729
- bsum += sumi * ls2;
3730
- }
3731
- sumf += d * bsum;
3732
- }
3733
- *s = sumf;
3360
+ UNUSED(x);
3361
+ UNUSED(y);
3362
+ UNUSED(nb);
3363
+ wsp_ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3734
3364
  #endif
3735
3365
  }
3736
3366
 
@@ -3793,36 +3423,10 @@ void wsp_ggml_vec_dot_iq1_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
3793
3423
  *s = sumf;
3794
3424
 
3795
3425
  #else
3796
-
3797
- float sumf = 0;
3798
- for (int i = 0; i < nb; i++) {
3799
-
3800
- const int8_t * q8 = y[i].qs;
3801
- const uint8_t * qs = x[i].qs;
3802
- const uint16_t * qh = x[i].qh;
3803
-
3804
- int sumi = 0, sumi1 = 0;
3805
- for (int ib = 0; ib < QK_K/32; ++ib) {
3806
- const int ls = 2*((qh[ib] >> 12) & 7) + 1;
3807
- const int delta = qh[ib] & 0x8000 ? -1 : 1;
3808
- int lsum = 0;
3809
- for (int l = 0; l < 4; ++l) {
3810
- const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
3811
- for (int j = 0; j < 8; ++j) {
3812
- lsum += q8[j] * grid[j];
3813
- }
3814
- q8 += 8;
3815
- }
3816
- sumi += ls * lsum;
3817
- sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
3818
- qs += 4;
3819
- }
3820
-
3821
- sumf += WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
3822
- }
3823
-
3824
- *s = sumf;
3825
-
3426
+ UNUSED(x);
3427
+ UNUSED(y);
3428
+ UNUSED(nb);
3429
+ wsp_ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3826
3430
  #endif
3827
3431
  }
3828
3432
 
@@ -3912,52 +3516,11 @@ void wsp_ggml_vec_dot_iq1_m_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
3912
3516
  *s = sumf;
3913
3517
 
3914
3518
  #else
3915
-
3916
- int sum1[2], sum2[2], delta[4];
3917
-
3918
- float sumf = 0;
3919
- for (int i = 0; i < nb; i++) {
3920
-
3921
- const int8_t * q8 = y[i].qs;
3922
- const uint8_t * qs = x[i].qs;
3923
- const uint8_t * qh = x[i].qh;
3924
- const uint16_t * sc = (const uint16_t *)x[i].scales;
3925
-
3926
- scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
3927
-
3928
- int sumi1 = 0, sumi2 = 0;
3929
- for (int ib = 0; ib < QK_K/32; ++ib) {
3930
- delta[0] = qh[0] & 0x08 ? -1 : 1;
3931
- delta[1] = qh[0] & 0x80 ? -1 : 1;
3932
- delta[2] = qh[1] & 0x08 ? -1 : 1;
3933
- delta[3] = qh[1] & 0x80 ? -1 : 1;
3934
- sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
3935
- for (int l = 0; l < 4; ++l) {
3936
- const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
3937
- int lsum1 = 0, lsum2 = 0;
3938
- for (int j = 0; j < 8; ++j) {
3939
- lsum1 += q8[j] * grid[j];
3940
- lsum2 += q8[j];
3941
- }
3942
- q8 += 8;
3943
- sum1[l/2] += lsum1;
3944
- sum2[l/2] += lsum2*delta[l];
3945
- }
3946
-
3947
- const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
3948
- const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
3949
-
3950
- sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
3951
- sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
3952
- qs += 4;
3953
- qh += 2;
3954
- }
3955
-
3956
- sumf += WSP_GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
3957
- }
3958
-
3959
- *s = sumf;
3960
-
3519
+ UNUSED(x);
3520
+ UNUSED(y);
3521
+ UNUSED(nb);
3522
+ UNUSED(scale);
3523
+ wsp_ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3961
3524
  #endif
3962
3525
  }
3963
3526
 
@@ -4078,37 +3641,10 @@ void wsp_ggml_vec_dot_iq4_xs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
4078
3641
  *s = sumf;
4079
3642
 
4080
3643
  #else
4081
- float sumf = 0;
4082
- for (int ibl = 0; ibl < nb; ++ibl) {
4083
- const float d4d8 = WSP_GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
4084
- uint16_t h = x[ibl].scales_h;
4085
- const uint8_t * qs = x[ibl].qs;
4086
- const int8_t * q8 = y[ibl].qs;
4087
- for (int ib = 0; ib < QK_K/32; ib += 2) {
4088
- const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
4089
- const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
4090
- h >>= 4;
4091
- const float d1 = d4d8*(ls1 - 32);
4092
- const float d2 = d4d8*(ls2 - 32);
4093
- int sumi1 = 0, sumi2 = 0;
4094
- for (int j = 0; j < 16; ++j) {
4095
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
4096
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
4097
- }
4098
- sumf += d1 * (sumi1 + sumi2);
4099
- qs += 16;
4100
- q8 += 32;
4101
- sumi1 = sumi2 = 0;
4102
- for (int j = 0; j < 16; ++j) {
4103
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
4104
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
4105
- }
4106
- sumf += d2 * (sumi1 + sumi2);
4107
- qs += 16;
4108
- q8 += 32;
4109
- }
4110
- }
4111
- *s = sumf;
3644
+ UNUSED(x);
3645
+ UNUSED(y);
3646
+ UNUSED(nb);
3647
+ wsp_ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
4112
3648
  #endif
4113
3649
  }
4114
3650