whisper.rn 0.5.0-rc.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/ggml-alloc.c +1 -15
- package/cpp/ggml-backend-reg.cpp +17 -8
- package/cpp/ggml-backend.cpp +15 -22
- package/cpp/ggml-common.h +17 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/ggml-cpu/arch-fallback.h +34 -0
- package/cpp/ggml-cpu/ggml-cpu.c +22 -1
- package/cpp/ggml-cpu/ggml-cpu.cpp +21 -24
- package/cpp/ggml-cpu/ops.cpp +870 -211
- package/cpp/ggml-cpu/ops.h +3 -8
- package/cpp/ggml-cpu/quants.c +35 -0
- package/cpp/ggml-cpu/quants.h +8 -0
- package/cpp/ggml-cpu/repack.cpp +458 -47
- package/cpp/ggml-cpu/repack.h +22 -0
- package/cpp/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/ggml-cpu/traits.cpp +2 -2
- package/cpp/ggml-cpu/traits.h +1 -1
- package/cpp/ggml-cpu/vec.cpp +12 -9
- package/cpp/ggml-cpu/vec.h +107 -13
- package/cpp/ggml-impl.h +77 -0
- package/cpp/ggml-metal-impl.h +51 -12
- package/cpp/ggml-metal.m +610 -115
- package/cpp/ggml-opt.cpp +97 -41
- package/cpp/ggml-opt.h +25 -6
- package/cpp/ggml-quants.c +110 -16
- package/cpp/ggml-quants.h +6 -0
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +314 -88
- package/cpp/ggml.h +137 -11
- package/cpp/gguf.cpp +8 -1
- package/cpp/jsi/RNWhisperJSI.cpp +23 -6
- package/cpp/whisper.cpp +15 -6
- package/ios/RNWhisper.mm +6 -6
- package/ios/RNWhisperContext.mm +2 -0
- package/ios/RNWhisperVadContext.mm +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +28 -2
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/module/realtime-transcription/RealtimeTranscriber.js +28 -2
- package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +1 -0
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
- package/lib/typescript/realtime-transcription/types.d.ts +6 -0
- package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/realtime-transcription/RealtimeTranscriber.ts +32 -0
- package/src/realtime-transcription/types.ts +6 -0
|
@@ -589,6 +589,67 @@ void wsp_ggml_vec_dot_q4_1_q8_1(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
589
589
|
*s = sumf;
|
|
590
590
|
}
|
|
591
591
|
|
|
592
|
+
void wsp_ggml_vec_dot_mxfp4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
|
|
593
|
+
assert(nrc == 1);
|
|
594
|
+
UNUSED(nrc);
|
|
595
|
+
UNUSED(bx);
|
|
596
|
+
UNUSED(by);
|
|
597
|
+
UNUSED(bs);
|
|
598
|
+
assert(n % QK_MXFP4 == 0);
|
|
599
|
+
static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
|
|
600
|
+
|
|
601
|
+
const block_mxfp4 * WSP_GGML_RESTRICT x = vx;
|
|
602
|
+
const block_q8_0 * WSP_GGML_RESTRICT y = vy;
|
|
603
|
+
|
|
604
|
+
const int nb = n / QK_MXFP4;
|
|
605
|
+
|
|
606
|
+
int ib = 0;
|
|
607
|
+
float sumf = 0;
|
|
608
|
+
|
|
609
|
+
#if defined __ARM_NEON
|
|
610
|
+
const int8x16_t values = vld1q_s8(kvalues_mxfp4);
|
|
611
|
+
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
|
612
|
+
uint8x16x2_t q4bits;
|
|
613
|
+
int8x16x4_t q4b;
|
|
614
|
+
int8x16x4_t q8b;
|
|
615
|
+
int32x4_t prod_1;
|
|
616
|
+
int32x4_t prod_2;
|
|
617
|
+
|
|
618
|
+
for (; ib + 1 < nb; ib += 2) {
|
|
619
|
+
q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
|
|
620
|
+
q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
|
|
621
|
+
q8b.val[0] = vld1q_s8(y[ib + 0].qs);
|
|
622
|
+
q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
|
|
623
|
+
q8b.val[2] = vld1q_s8(y[ib + 1].qs);
|
|
624
|
+
q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);
|
|
625
|
+
|
|
626
|
+
q4b.val[0] = wsp_ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
|
|
627
|
+
q4b.val[1] = wsp_ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
|
|
628
|
+
q4b.val[2] = wsp_ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
|
|
629
|
+
q4b.val[3] = wsp_ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
|
|
630
|
+
|
|
631
|
+
prod_1 = wsp_ggml_vdotq_s32(wsp_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
|
632
|
+
prod_2 = wsp_ggml_vdotq_s32(wsp_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
|
633
|
+
|
|
634
|
+
sumf +=
|
|
635
|
+
WSP_GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * WSP_GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
|
|
636
|
+
WSP_GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * WSP_GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
#endif
|
|
640
|
+
for (; ib < nb; ++ib) {
|
|
641
|
+
const float d = WSP_GGML_CPU_FP16_TO_FP32(y[ib].d)*WSP_GGML_E8M0_TO_FP32_HALF(x[ib].e);
|
|
642
|
+
int sumi1 = 0;
|
|
643
|
+
int sumi2 = 0;
|
|
644
|
+
for (int j = 0; j < QK_MXFP4/2; ++j) {
|
|
645
|
+
sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
|
|
646
|
+
sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
|
|
647
|
+
}
|
|
648
|
+
sumf += d * (sumi1 + sumi2);
|
|
649
|
+
}
|
|
650
|
+
*s = sumf;
|
|
651
|
+
}
|
|
652
|
+
|
|
592
653
|
void wsp_ggml_vec_dot_q5_0_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
|
|
593
654
|
const int qk = QK8_0;
|
|
594
655
|
const int nb = n / qk;
|
|
@@ -1236,44 +1297,10 @@ void wsp_ggml_vec_dot_tq1_0_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
1236
1297
|
*s = sumf;
|
|
1237
1298
|
|
|
1238
1299
|
#else
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
for (int i = 0; i < nb; ++i) {
|
|
1244
|
-
int sum = 0;
|
|
1245
|
-
|
|
1246
|
-
for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
|
|
1247
|
-
for (size_t l = 0; l < 5; ++l) {
|
|
1248
|
-
for (size_t m = 0; m < 32; ++m) {
|
|
1249
|
-
uint8_t q = x[i].qs[j + m] * pow3[l];
|
|
1250
|
-
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
|
1251
|
-
sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
|
|
1252
|
-
}
|
|
1253
|
-
}
|
|
1254
|
-
}
|
|
1255
|
-
for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
|
|
1256
|
-
for (size_t l = 0; l < 5; ++l) {
|
|
1257
|
-
for (size_t m = 0; m < 16; ++m) {
|
|
1258
|
-
uint8_t q = x[i].qs[j + m] * pow3[l];
|
|
1259
|
-
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
|
1260
|
-
sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
|
|
1261
|
-
}
|
|
1262
|
-
}
|
|
1263
|
-
}
|
|
1264
|
-
|
|
1265
|
-
for (size_t l = 0; l < 4; ++l) {
|
|
1266
|
-
for (size_t j = 0; j < sizeof(x->qh); ++j) {
|
|
1267
|
-
uint8_t q = x[i].qh[j] * pow3[l];
|
|
1268
|
-
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
|
1269
|
-
sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
|
|
1270
|
-
}
|
|
1271
|
-
}
|
|
1272
|
-
|
|
1273
|
-
sumf += (float) sum * (WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
|
|
1274
|
-
}
|
|
1275
|
-
|
|
1276
|
-
*s = sumf;
|
|
1300
|
+
UNUSED(x);
|
|
1301
|
+
UNUSED(y);
|
|
1302
|
+
UNUSED(nb);
|
|
1303
|
+
wsp_ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1277
1304
|
#endif
|
|
1278
1305
|
}
|
|
1279
1306
|
|
|
@@ -1381,25 +1408,10 @@ void wsp_ggml_vec_dot_tq2_0_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
1381
1408
|
*s = sumf;
|
|
1382
1409
|
|
|
1383
1410
|
#else
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
for (size_t j = 0; j < sizeof(x->qs); j += 32) {
|
|
1390
|
-
for (size_t l = 0; l < 4; ++l) {
|
|
1391
|
-
for (size_t k = 0; k < 32; ++k) {
|
|
1392
|
-
sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
|
|
1393
|
-
}
|
|
1394
|
-
}
|
|
1395
|
-
}
|
|
1396
|
-
|
|
1397
|
-
const float d = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1398
|
-
|
|
1399
|
-
sumf += (float) sumi * d;
|
|
1400
|
-
}
|
|
1401
|
-
|
|
1402
|
-
*s = sumf;
|
|
1411
|
+
UNUSED(x);
|
|
1412
|
+
UNUSED(y);
|
|
1413
|
+
UNUSED(nb);
|
|
1414
|
+
wsp_ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1403
1415
|
#endif
|
|
1404
1416
|
}
|
|
1405
1417
|
|
|
@@ -1729,45 +1741,10 @@ void wsp_ggml_vec_dot_q2_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
1729
1741
|
*s = sum;
|
|
1730
1742
|
|
|
1731
1743
|
#else
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
const uint8_t * q2 = x[i].qs;
|
|
1738
|
-
const int8_t * q8 = y[i].qs;
|
|
1739
|
-
const uint8_t * sc = x[i].scales;
|
|
1740
|
-
|
|
1741
|
-
int summs = 0;
|
|
1742
|
-
for (int j = 0; j < 16; ++j) {
|
|
1743
|
-
summs += y[i].bsums[j] * (sc[j] >> 4);
|
|
1744
|
-
}
|
|
1745
|
-
|
|
1746
|
-
const float dall = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1747
|
-
const float dmin = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1748
|
-
|
|
1749
|
-
int isum = 0;
|
|
1750
|
-
int is = 0;
|
|
1751
|
-
int d;
|
|
1752
|
-
for (int k = 0; k < QK_K/128; ++k) {
|
|
1753
|
-
int shift = 0;
|
|
1754
|
-
for (int j = 0; j < 4; ++j) {
|
|
1755
|
-
d = sc[is++] & 0xF;
|
|
1756
|
-
int isuml = 0;
|
|
1757
|
-
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
1758
|
-
isum += d * isuml;
|
|
1759
|
-
d = sc[is++] & 0xF;
|
|
1760
|
-
isuml = 0;
|
|
1761
|
-
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
1762
|
-
isum += d * isuml;
|
|
1763
|
-
shift += 2;
|
|
1764
|
-
q8 += 32;
|
|
1765
|
-
}
|
|
1766
|
-
q2 += 32;
|
|
1767
|
-
}
|
|
1768
|
-
sumf += dall * isum - dmin * summs;
|
|
1769
|
-
}
|
|
1770
|
-
*s = sumf;
|
|
1744
|
+
UNUSED(x);
|
|
1745
|
+
UNUSED(y);
|
|
1746
|
+
UNUSED(nb);
|
|
1747
|
+
wsp_ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1771
1748
|
#endif
|
|
1772
1749
|
}
|
|
1773
1750
|
|
|
@@ -2057,68 +2034,12 @@ void wsp_ggml_vec_dot_q3_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
2057
2034
|
*s = sum;
|
|
2058
2035
|
|
|
2059
2036
|
#else
|
|
2060
|
-
|
|
2061
|
-
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
|
|
2065
|
-
|
|
2066
|
-
// write vectorized versions for AVX, ARM_NEON, etc.
|
|
2067
|
-
|
|
2068
|
-
int8_t aux8[QK_K];
|
|
2069
|
-
int16_t aux16[8];
|
|
2070
|
-
float sums [8];
|
|
2071
|
-
int32_t aux32[8];
|
|
2072
|
-
memset(sums, 0, 8*sizeof(float));
|
|
2073
|
-
|
|
2074
|
-
uint32_t auxs[4];
|
|
2075
|
-
const int8_t * scales = (const int8_t*)auxs;
|
|
2076
|
-
|
|
2077
|
-
float sumf = 0;
|
|
2078
|
-
for (int i = 0; i < nb; ++i) {
|
|
2079
|
-
const uint8_t * WSP_GGML_RESTRICT q3 = x[i].qs;
|
|
2080
|
-
const uint8_t * WSP_GGML_RESTRICT hm = x[i].hmask;
|
|
2081
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
2082
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
2083
|
-
int8_t * WSP_GGML_RESTRICT a = aux8;
|
|
2084
|
-
uint8_t m = 1;
|
|
2085
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
2086
|
-
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
|
2087
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
2088
|
-
a += 32; m <<= 1;
|
|
2089
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
|
2090
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
2091
|
-
a += 32; m <<= 1;
|
|
2092
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
|
2093
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
2094
|
-
a += 32; m <<= 1;
|
|
2095
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
|
2096
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
2097
|
-
a += 32; m <<= 1;
|
|
2098
|
-
q3 += 32;
|
|
2099
|
-
}
|
|
2100
|
-
a = aux8;
|
|
2101
|
-
|
|
2102
|
-
memcpy(auxs, x[i].scales, 12);
|
|
2103
|
-
uint32_t tmp = auxs[2];
|
|
2104
|
-
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
|
2105
|
-
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
|
2106
|
-
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
|
2107
|
-
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
|
2108
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
2109
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2110
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
2111
|
-
q8 += 8; a += 8;
|
|
2112
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2113
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
2114
|
-
q8 += 8; a += 8;
|
|
2115
|
-
}
|
|
2116
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2117
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2118
|
-
}
|
|
2119
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
2120
|
-
*s = sumf;
|
|
2121
|
-
|
|
2037
|
+
UNUSED(kmask1);
|
|
2038
|
+
UNUSED(kmask2);
|
|
2039
|
+
UNUSED(x);
|
|
2040
|
+
UNUSED(y);
|
|
2041
|
+
UNUSED(nb);
|
|
2042
|
+
wsp_ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2122
2043
|
#endif
|
|
2123
2044
|
|
|
2124
2045
|
}
|
|
@@ -2431,61 +2352,14 @@ void wsp_ggml_vec_dot_q4_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
2431
2352
|
*s = sumf;
|
|
2432
2353
|
|
|
2433
2354
|
#else
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
memset(sums, 0, 8*sizeof(float));
|
|
2443
|
-
|
|
2444
|
-
float sumf = 0;
|
|
2445
|
-
for (int i = 0; i < nb; ++i) {
|
|
2446
|
-
const uint8_t * WSP_GGML_RESTRICT q4 = x[i].qs;
|
|
2447
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
2448
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
2449
|
-
int8_t * WSP_GGML_RESTRICT a = aux8;
|
|
2450
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
2451
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
2452
|
-
a += 32;
|
|
2453
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
2454
|
-
a += 32; q4 += 32;
|
|
2455
|
-
}
|
|
2456
|
-
memcpy(utmp, x[i].scales, 12);
|
|
2457
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
2458
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
2459
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
2460
|
-
utmp[2] = uaux;
|
|
2461
|
-
utmp[0] &= kmask1;
|
|
2462
|
-
|
|
2463
|
-
int sumi = 0;
|
|
2464
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
2465
|
-
a = aux8;
|
|
2466
|
-
int is = 0;
|
|
2467
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
2468
|
-
int32_t scale = scales[is++];
|
|
2469
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2470
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2471
|
-
q8 += 8; a += 8;
|
|
2472
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2473
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2474
|
-
q8 += 8; a += 8;
|
|
2475
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2476
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2477
|
-
q8 += 8; a += 8;
|
|
2478
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2479
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2480
|
-
q8 += 8; a += 8;
|
|
2481
|
-
}
|
|
2482
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2483
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2484
|
-
const float dmin = WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2485
|
-
sumf -= dmin * sumi;
|
|
2486
|
-
}
|
|
2487
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
2488
|
-
*s = sumf;
|
|
2355
|
+
UNUSED(x);
|
|
2356
|
+
UNUSED(y);
|
|
2357
|
+
UNUSED(nb);
|
|
2358
|
+
UNUSED(kmask1);
|
|
2359
|
+
UNUSED(kmask2);
|
|
2360
|
+
UNUSED(kmask3);
|
|
2361
|
+
UNUSED(utmp);
|
|
2362
|
+
wsp_ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2489
2363
|
#endif
|
|
2490
2364
|
}
|
|
2491
2365
|
|
|
@@ -2578,66 +2452,14 @@ void wsp_ggml_vec_dot_q5_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
2578
2452
|
*s = sumf;
|
|
2579
2453
|
|
|
2580
2454
|
#else
|
|
2581
|
-
|
|
2582
|
-
|
|
2583
|
-
|
|
2584
|
-
|
|
2585
|
-
|
|
2586
|
-
|
|
2587
|
-
|
|
2588
|
-
|
|
2589
|
-
memset(sums, 0, 8*sizeof(float));
|
|
2590
|
-
|
|
2591
|
-
float sumf = 0;
|
|
2592
|
-
for (int i = 0; i < nb; ++i) {
|
|
2593
|
-
const uint8_t * WSP_GGML_RESTRICT q4 = x[i].qs;
|
|
2594
|
-
const uint8_t * WSP_GGML_RESTRICT hm = x[i].qh;
|
|
2595
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
2596
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
2597
|
-
int8_t * WSP_GGML_RESTRICT a = aux8;
|
|
2598
|
-
uint8_t m = 1;
|
|
2599
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
2600
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
2601
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
2602
|
-
a += 32; m <<= 1;
|
|
2603
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
2604
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
2605
|
-
a += 32; m <<= 1;
|
|
2606
|
-
q4 += 32;
|
|
2607
|
-
}
|
|
2608
|
-
memcpy(utmp, x[i].scales, 12);
|
|
2609
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
2610
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
2611
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
2612
|
-
utmp[2] = uaux;
|
|
2613
|
-
utmp[0] &= kmask1;
|
|
2614
|
-
|
|
2615
|
-
int sumi = 0;
|
|
2616
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
2617
|
-
a = aux8;
|
|
2618
|
-
int is = 0;
|
|
2619
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
2620
|
-
int32_t scale = scales[is++];
|
|
2621
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2622
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2623
|
-
q8 += 8; a += 8;
|
|
2624
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2625
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2626
|
-
q8 += 8; a += 8;
|
|
2627
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2628
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2629
|
-
q8 += 8; a += 8;
|
|
2630
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2631
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2632
|
-
q8 += 8; a += 8;
|
|
2633
|
-
}
|
|
2634
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2635
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2636
|
-
const float dmin = WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2637
|
-
sumf -= dmin * sumi;
|
|
2638
|
-
}
|
|
2639
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
2640
|
-
*s = sumf;
|
|
2455
|
+
UNUSED(x);
|
|
2456
|
+
UNUSED(y);
|
|
2457
|
+
UNUSED(nb);
|
|
2458
|
+
UNUSED(kmask1);
|
|
2459
|
+
UNUSED(kmask2);
|
|
2460
|
+
UNUSED(kmask3);
|
|
2461
|
+
UNUSED(utmp);
|
|
2462
|
+
wsp_ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2641
2463
|
#endif
|
|
2642
2464
|
}
|
|
2643
2465
|
|
|
@@ -3093,47 +2915,10 @@ void wsp_ggml_vec_dot_q6_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
3093
2915
|
}
|
|
3094
2916
|
*s = sum;
|
|
3095
2917
|
#else
|
|
3096
|
-
|
|
3097
|
-
|
|
3098
|
-
|
|
3099
|
-
|
|
3100
|
-
int32_t aux32[8];
|
|
3101
|
-
memset(sums, 0, 8*sizeof(float));
|
|
3102
|
-
|
|
3103
|
-
float sumf = 0;
|
|
3104
|
-
for (int i = 0; i < nb; ++i) {
|
|
3105
|
-
const uint8_t * WSP_GGML_RESTRICT q4 = x[i].ql;
|
|
3106
|
-
const uint8_t * WSP_GGML_RESTRICT qh = x[i].qh;
|
|
3107
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
3108
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
3109
|
-
int8_t * WSP_GGML_RESTRICT a = aux8;
|
|
3110
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
3111
|
-
for (int l = 0; l < 32; ++l) {
|
|
3112
|
-
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
|
3113
|
-
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
|
3114
|
-
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
|
3115
|
-
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
|
3116
|
-
}
|
|
3117
|
-
a += 128;
|
|
3118
|
-
q4 += 64;
|
|
3119
|
-
qh += 32;
|
|
3120
|
-
}
|
|
3121
|
-
a = aux8;
|
|
3122
|
-
int is = 0;
|
|
3123
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
3124
|
-
int scale = x[i].scales[is++];
|
|
3125
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
3126
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
3127
|
-
q8 += 8; a += 8;
|
|
3128
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
3129
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
3130
|
-
q8 += 8; a += 8;
|
|
3131
|
-
}
|
|
3132
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3133
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
3134
|
-
}
|
|
3135
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
3136
|
-
*s = sumf;
|
|
2918
|
+
UNUSED(x);
|
|
2919
|
+
UNUSED(y);
|
|
2920
|
+
UNUSED(nb);
|
|
2921
|
+
wsp_ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3137
2922
|
#endif
|
|
3138
2923
|
}
|
|
3139
2924
|
|
|
@@ -3229,34 +3014,10 @@ void wsp_ggml_vec_dot_iq2_xxs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs
|
|
|
3229
3014
|
*s = 0.25f * sumf;
|
|
3230
3015
|
|
|
3231
3016
|
#else
|
|
3232
|
-
|
|
3233
|
-
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
float sumf = 0.f;
|
|
3237
|
-
for (int i = 0; i < nb; ++i) {
|
|
3238
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3239
|
-
const uint16_t * WSP_GGML_RESTRICT q2 = x[i].qs;
|
|
3240
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
3241
|
-
int32_t bsum = 0;
|
|
3242
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3243
|
-
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
|
3244
|
-
q2 += 4;
|
|
3245
|
-
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
|
3246
|
-
int32_t sumi = 0;
|
|
3247
|
-
for (int l = 0; l < 4; ++l) {
|
|
3248
|
-
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
|
3249
|
-
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
|
3250
|
-
for (int j = 0; j < 8; ++j) {
|
|
3251
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
3252
|
-
}
|
|
3253
|
-
q8 += 8;
|
|
3254
|
-
}
|
|
3255
|
-
bsum += sumi * ls;
|
|
3256
|
-
}
|
|
3257
|
-
sumf += d * bsum;
|
|
3258
|
-
}
|
|
3259
|
-
*s = 0.125f * sumf;
|
|
3017
|
+
UNUSED(x);
|
|
3018
|
+
UNUSED(y);
|
|
3019
|
+
UNUSED(nb);
|
|
3020
|
+
wsp_ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3260
3021
|
#endif
|
|
3261
3022
|
}
|
|
3262
3023
|
|
|
@@ -3327,42 +3088,10 @@ void wsp_ggml_vec_dot_iq2_xs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
3327
3088
|
*s = 0.125f * sumf;
|
|
3328
3089
|
|
|
3329
3090
|
#else
|
|
3330
|
-
|
|
3331
|
-
|
|
3332
|
-
|
|
3333
|
-
|
|
3334
|
-
const uint16_t * WSP_GGML_RESTRICT q2 = x[i].qs;
|
|
3335
|
-
const uint8_t * WSP_GGML_RESTRICT sc = x[i].scales;
|
|
3336
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
3337
|
-
int32_t bsum = 0;
|
|
3338
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3339
|
-
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
|
|
3340
|
-
const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
|
|
3341
|
-
int32_t sumi = 0;
|
|
3342
|
-
for (int l = 0; l < 2; ++l) {
|
|
3343
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
3344
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
3345
|
-
for (int j = 0; j < 8; ++j) {
|
|
3346
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
3347
|
-
}
|
|
3348
|
-
q8 += 8;
|
|
3349
|
-
}
|
|
3350
|
-
bsum += sumi * ls1;
|
|
3351
|
-
sumi = 0;
|
|
3352
|
-
for (int l = 2; l < 4; ++l) {
|
|
3353
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
3354
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
3355
|
-
for (int j = 0; j < 8; ++j) {
|
|
3356
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
3357
|
-
}
|
|
3358
|
-
q8 += 8;
|
|
3359
|
-
}
|
|
3360
|
-
bsum += sumi * ls2;
|
|
3361
|
-
q2 += 4;
|
|
3362
|
-
}
|
|
3363
|
-
sumf += d * bsum;
|
|
3364
|
-
}
|
|
3365
|
-
*s = 0.125f * sumf;
|
|
3091
|
+
UNUSED(x);
|
|
3092
|
+
UNUSED(y);
|
|
3093
|
+
UNUSED(nb);
|
|
3094
|
+
wsp_ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3366
3095
|
#endif
|
|
3367
3096
|
}
|
|
3368
3097
|
|
|
@@ -3455,45 +3184,10 @@ void wsp_ggml_vec_dot_iq2_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
3455
3184
|
*s = 0.125f * sumf;
|
|
3456
3185
|
|
|
3457
3186
|
#else
|
|
3458
|
-
|
|
3459
|
-
|
|
3460
|
-
|
|
3461
|
-
|
|
3462
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3463
|
-
const int8_t * q8 = y[i].qs;
|
|
3464
|
-
const uint8_t * qs = x[i].qs;
|
|
3465
|
-
const uint8_t * qh = x[i].qh;
|
|
3466
|
-
const uint8_t * signs = qs + QK_K/8;
|
|
3467
|
-
|
|
3468
|
-
int bsum = 0;
|
|
3469
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3470
|
-
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
|
3471
|
-
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
|
3472
|
-
int sumi1 = 0, sumi2 = 0;
|
|
3473
|
-
for (int l = 0; l < 2; ++l) {
|
|
3474
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
3475
|
-
for (int j = 0; j < 8; ++j) {
|
|
3476
|
-
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
3477
|
-
}
|
|
3478
|
-
q8 += 8;
|
|
3479
|
-
}
|
|
3480
|
-
for (int l = 2; l < 4; ++l) {
|
|
3481
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
3482
|
-
for (int j = 0; j < 8; ++j) {
|
|
3483
|
-
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
3484
|
-
}
|
|
3485
|
-
q8 += 8;
|
|
3486
|
-
}
|
|
3487
|
-
bsum += ls1 * sumi1 + ls2 * sumi2;
|
|
3488
|
-
qs += 4;
|
|
3489
|
-
signs += 4;
|
|
3490
|
-
}
|
|
3491
|
-
|
|
3492
|
-
sumf += d * bsum;
|
|
3493
|
-
}
|
|
3494
|
-
|
|
3495
|
-
*s = 0.125f * sumf;
|
|
3496
|
-
|
|
3187
|
+
UNUSED(x);
|
|
3188
|
+
UNUSED(y);
|
|
3189
|
+
UNUSED(nb);
|
|
3190
|
+
wsp_ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3497
3191
|
#endif
|
|
3498
3192
|
|
|
3499
3193
|
}
|
|
@@ -3553,36 +3247,10 @@ void wsp_ggml_vec_dot_iq3_xxs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs
|
|
|
3553
3247
|
*s = 0.5f * sumf;
|
|
3554
3248
|
|
|
3555
3249
|
#else
|
|
3556
|
-
|
|
3557
|
-
|
|
3558
|
-
|
|
3559
|
-
|
|
3560
|
-
for (int i = 0; i < nb; ++i) {
|
|
3561
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3562
|
-
const uint8_t * WSP_GGML_RESTRICT q3 = x[i].qs;
|
|
3563
|
-
const uint8_t * WSP_GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
3564
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
3565
|
-
int32_t bsum = 0;
|
|
3566
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3567
|
-
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
|
3568
|
-
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
|
3569
|
-
int32_t sumi = 0;
|
|
3570
|
-
for (int l = 0; l < 4; ++l) {
|
|
3571
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
|
3572
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
|
3573
|
-
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
|
3574
|
-
for (int j = 0; j < 4; ++j) {
|
|
3575
|
-
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
|
3576
|
-
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
|
3577
|
-
}
|
|
3578
|
-
q8 += 8;
|
|
3579
|
-
}
|
|
3580
|
-
q3 += 8;
|
|
3581
|
-
bsum += sumi * ls;
|
|
3582
|
-
}
|
|
3583
|
-
sumf += d * bsum;
|
|
3584
|
-
}
|
|
3585
|
-
*s = 0.25f * sumf;
|
|
3250
|
+
UNUSED(x);
|
|
3251
|
+
UNUSED(y);
|
|
3252
|
+
UNUSED(nb);
|
|
3253
|
+
wsp_ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3586
3254
|
#endif
|
|
3587
3255
|
}
|
|
3588
3256
|
|
|
@@ -3689,48 +3357,10 @@ void wsp_ggml_vec_dot_iq3_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
3689
3357
|
*s = sumf;
|
|
3690
3358
|
|
|
3691
3359
|
#else
|
|
3692
|
-
|
|
3693
|
-
|
|
3694
|
-
|
|
3695
|
-
|
|
3696
|
-
const uint8_t * WSP_GGML_RESTRICT qs = x[i].qs;
|
|
3697
|
-
const uint8_t * WSP_GGML_RESTRICT qh = x[i].qh;
|
|
3698
|
-
const uint8_t * WSP_GGML_RESTRICT signs = x[i].signs;
|
|
3699
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
3700
|
-
int32_t bsum = 0;
|
|
3701
|
-
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
3702
|
-
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
|
3703
|
-
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
|
3704
|
-
int32_t sumi = 0;
|
|
3705
|
-
for (int l = 0; l < 4; ++l) {
|
|
3706
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
|
3707
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
|
3708
|
-
for (int j = 0; j < 4; ++j) {
|
|
3709
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
3710
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
3711
|
-
}
|
|
3712
|
-
q8 += 8;
|
|
3713
|
-
}
|
|
3714
|
-
qs += 8;
|
|
3715
|
-
signs += 4;
|
|
3716
|
-
bsum += sumi * ls1;
|
|
3717
|
-
sumi = 0;
|
|
3718
|
-
for (int l = 0; l < 4; ++l) {
|
|
3719
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
|
3720
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
|
3721
|
-
for (int j = 0; j < 4; ++j) {
|
|
3722
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
3723
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
3724
|
-
}
|
|
3725
|
-
q8 += 8;
|
|
3726
|
-
}
|
|
3727
|
-
qs += 8;
|
|
3728
|
-
signs += 4;
|
|
3729
|
-
bsum += sumi * ls2;
|
|
3730
|
-
}
|
|
3731
|
-
sumf += d * bsum;
|
|
3732
|
-
}
|
|
3733
|
-
*s = sumf;
|
|
3360
|
+
UNUSED(x);
|
|
3361
|
+
UNUSED(y);
|
|
3362
|
+
UNUSED(nb);
|
|
3363
|
+
wsp_ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3734
3364
|
#endif
|
|
3735
3365
|
}
|
|
3736
3366
|
|
|
@@ -3793,36 +3423,10 @@ void wsp_ggml_vec_dot_iq1_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
3793
3423
|
*s = sumf;
|
|
3794
3424
|
|
|
3795
3425
|
#else
|
|
3796
|
-
|
|
3797
|
-
|
|
3798
|
-
|
|
3799
|
-
|
|
3800
|
-
const int8_t * q8 = y[i].qs;
|
|
3801
|
-
const uint8_t * qs = x[i].qs;
|
|
3802
|
-
const uint16_t * qh = x[i].qh;
|
|
3803
|
-
|
|
3804
|
-
int sumi = 0, sumi1 = 0;
|
|
3805
|
-
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
3806
|
-
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
|
3807
|
-
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
|
3808
|
-
int lsum = 0;
|
|
3809
|
-
for (int l = 0; l < 4; ++l) {
|
|
3810
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
|
3811
|
-
for (int j = 0; j < 8; ++j) {
|
|
3812
|
-
lsum += q8[j] * grid[j];
|
|
3813
|
-
}
|
|
3814
|
-
q8 += 8;
|
|
3815
|
-
}
|
|
3816
|
-
sumi += ls * lsum;
|
|
3817
|
-
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
|
3818
|
-
qs += 4;
|
|
3819
|
-
}
|
|
3820
|
-
|
|
3821
|
-
sumf += WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
|
3822
|
-
}
|
|
3823
|
-
|
|
3824
|
-
*s = sumf;
|
|
3825
|
-
|
|
3426
|
+
UNUSED(x);
|
|
3427
|
+
UNUSED(y);
|
|
3428
|
+
UNUSED(nb);
|
|
3429
|
+
wsp_ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3826
3430
|
#endif
|
|
3827
3431
|
}
|
|
3828
3432
|
|
|
@@ -3912,52 +3516,11 @@ void wsp_ggml_vec_dot_iq1_m_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
3912
3516
|
*s = sumf;
|
|
3913
3517
|
|
|
3914
3518
|
#else
|
|
3915
|
-
|
|
3916
|
-
|
|
3917
|
-
|
|
3918
|
-
|
|
3919
|
-
|
|
3920
|
-
|
|
3921
|
-
const int8_t * q8 = y[i].qs;
|
|
3922
|
-
const uint8_t * qs = x[i].qs;
|
|
3923
|
-
const uint8_t * qh = x[i].qh;
|
|
3924
|
-
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
|
3925
|
-
|
|
3926
|
-
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
|
3927
|
-
|
|
3928
|
-
int sumi1 = 0, sumi2 = 0;
|
|
3929
|
-
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
3930
|
-
delta[0] = qh[0] & 0x08 ? -1 : 1;
|
|
3931
|
-
delta[1] = qh[0] & 0x80 ? -1 : 1;
|
|
3932
|
-
delta[2] = qh[1] & 0x08 ? -1 : 1;
|
|
3933
|
-
delta[3] = qh[1] & 0x80 ? -1 : 1;
|
|
3934
|
-
sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
|
|
3935
|
-
for (int l = 0; l < 4; ++l) {
|
|
3936
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
|
|
3937
|
-
int lsum1 = 0, lsum2 = 0;
|
|
3938
|
-
for (int j = 0; j < 8; ++j) {
|
|
3939
|
-
lsum1 += q8[j] * grid[j];
|
|
3940
|
-
lsum2 += q8[j];
|
|
3941
|
-
}
|
|
3942
|
-
q8 += 8;
|
|
3943
|
-
sum1[l/2] += lsum1;
|
|
3944
|
-
sum2[l/2] += lsum2*delta[l];
|
|
3945
|
-
}
|
|
3946
|
-
|
|
3947
|
-
const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
|
|
3948
|
-
const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
|
|
3949
|
-
|
|
3950
|
-
sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
|
|
3951
|
-
sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
|
|
3952
|
-
qs += 4;
|
|
3953
|
-
qh += 2;
|
|
3954
|
-
}
|
|
3955
|
-
|
|
3956
|
-
sumf += WSP_GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
|
3957
|
-
}
|
|
3958
|
-
|
|
3959
|
-
*s = sumf;
|
|
3960
|
-
|
|
3519
|
+
UNUSED(x);
|
|
3520
|
+
UNUSED(y);
|
|
3521
|
+
UNUSED(nb);
|
|
3522
|
+
UNUSED(scale);
|
|
3523
|
+
wsp_ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3961
3524
|
#endif
|
|
3962
3525
|
}
|
|
3963
3526
|
|
|
@@ -4078,37 +3641,10 @@ void wsp_ggml_vec_dot_iq4_xs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
4078
3641
|
*s = sumf;
|
|
4079
3642
|
|
|
4080
3643
|
#else
|
|
4081
|
-
|
|
4082
|
-
|
|
4083
|
-
|
|
4084
|
-
|
|
4085
|
-
const uint8_t * qs = x[ibl].qs;
|
|
4086
|
-
const int8_t * q8 = y[ibl].qs;
|
|
4087
|
-
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
|
4088
|
-
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
|
4089
|
-
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
|
4090
|
-
h >>= 4;
|
|
4091
|
-
const float d1 = d4d8*(ls1 - 32);
|
|
4092
|
-
const float d2 = d4d8*(ls2 - 32);
|
|
4093
|
-
int sumi1 = 0, sumi2 = 0;
|
|
4094
|
-
for (int j = 0; j < 16; ++j) {
|
|
4095
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
4096
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
4097
|
-
}
|
|
4098
|
-
sumf += d1 * (sumi1 + sumi2);
|
|
4099
|
-
qs += 16;
|
|
4100
|
-
q8 += 32;
|
|
4101
|
-
sumi1 = sumi2 = 0;
|
|
4102
|
-
for (int j = 0; j < 16; ++j) {
|
|
4103
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
4104
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
4105
|
-
}
|
|
4106
|
-
sumf += d2 * (sumi1 + sumi2);
|
|
4107
|
-
qs += 16;
|
|
4108
|
-
q8 += 32;
|
|
4109
|
-
}
|
|
4110
|
-
}
|
|
4111
|
-
*s = sumf;
|
|
3644
|
+
UNUSED(x);
|
|
3645
|
+
UNUSED(y);
|
|
3646
|
+
UNUSED(nb);
|
|
3647
|
+
wsp_ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
4112
3648
|
#endif
|
|
4113
3649
|
}
|
|
4114
3650
|
|