whisper.rn 0.5.0-rc.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/ggml-alloc.c +1 -15
- package/cpp/ggml-backend-reg.cpp +17 -8
- package/cpp/ggml-backend.cpp +15 -22
- package/cpp/ggml-common.h +17 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/ggml-cpu/arch-fallback.h +34 -0
- package/cpp/ggml-cpu/ggml-cpu.c +22 -1
- package/cpp/ggml-cpu/ggml-cpu.cpp +21 -24
- package/cpp/ggml-cpu/ops.cpp +870 -211
- package/cpp/ggml-cpu/ops.h +3 -8
- package/cpp/ggml-cpu/quants.c +35 -0
- package/cpp/ggml-cpu/quants.h +8 -0
- package/cpp/ggml-cpu/repack.cpp +458 -47
- package/cpp/ggml-cpu/repack.h +22 -0
- package/cpp/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/ggml-cpu/traits.cpp +2 -2
- package/cpp/ggml-cpu/traits.h +1 -1
- package/cpp/ggml-cpu/vec.cpp +12 -9
- package/cpp/ggml-cpu/vec.h +107 -13
- package/cpp/ggml-impl.h +77 -0
- package/cpp/ggml-metal-impl.h +51 -12
- package/cpp/ggml-metal.m +610 -115
- package/cpp/ggml-opt.cpp +97 -41
- package/cpp/ggml-opt.h +25 -6
- package/cpp/ggml-quants.c +110 -16
- package/cpp/ggml-quants.h +6 -0
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +314 -88
- package/cpp/ggml.h +137 -11
- package/cpp/gguf.cpp +8 -1
- package/cpp/jsi/RNWhisperJSI.cpp +23 -6
- package/cpp/whisper.cpp +15 -6
- package/ios/RNWhisper.mm +6 -6
- package/ios/RNWhisperContext.mm +2 -0
- package/ios/RNWhisperVadContext.mm +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +28 -2
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/module/realtime-transcription/RealtimeTranscriber.js +28 -2
- package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +1 -0
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
- package/lib/typescript/realtime-transcription/types.d.ts +6 -0
- package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/realtime-transcription/RealtimeTranscriber.ts +32 -0
- package/src/realtime-transcription/types.ts +6 -0
|
@@ -66,6 +66,12 @@ static inline int hsum_i32_4(const __m128i a) {
|
|
|
66
66
|
}
|
|
67
67
|
|
|
68
68
|
#if defined(__AVX2__) || defined(__AVX512F__)
|
|
69
|
+
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
|
70
|
+
const __m256i ax = _mm256_sign_epi8(x, x);
|
|
71
|
+
const __m256i sy = _mm256_sign_epi8(y, x);
|
|
72
|
+
return _mm256_maddubs_epi16(ax, sy);
|
|
73
|
+
}
|
|
74
|
+
|
|
69
75
|
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
|
70
76
|
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
|
71
77
|
uint32_t x32;
|
|
@@ -261,6 +267,11 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const
|
|
|
261
267
|
return _mm256_set_m128(_mm_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(x1) * WSP_GGML_CPU_FP16_TO_FP32(y1)),
|
|
262
268
|
_mm_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(x0) * WSP_GGML_CPU_FP16_TO_FP32(y0)));
|
|
263
269
|
}
|
|
270
|
+
|
|
271
|
+
static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) {
|
|
272
|
+
return _mm256_set_m128(_mm_set1_ps(WSP_GGML_E8M0_TO_FP32_HALF(x1) * WSP_GGML_CPU_FP16_TO_FP32(y1)),
|
|
273
|
+
_mm_set1_ps(WSP_GGML_E8M0_TO_FP32_HALF(x0) * WSP_GGML_CPU_FP16_TO_FP32(y0)));
|
|
274
|
+
}
|
|
264
275
|
#endif
|
|
265
276
|
#elif defined(__SSSE3__)
|
|
266
277
|
// horizontally add 4x4 floats
|
|
@@ -702,7 +713,6 @@ void wsp_ggml_vec_dot_q4_1_q8_1(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
702
713
|
const block_q8_1 * WSP_GGML_RESTRICT y = vy;
|
|
703
714
|
|
|
704
715
|
int ib = 0;
|
|
705
|
-
float sumf = 0;
|
|
706
716
|
|
|
707
717
|
#if defined(__AVX2__) || defined(__AVX__)
|
|
708
718
|
// Initialize accumulator with zeros
|
|
@@ -737,25 +747,98 @@ void wsp_ggml_vec_dot_q4_1_q8_1(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
737
747
|
#endif
|
|
738
748
|
}
|
|
739
749
|
|
|
740
|
-
|
|
741
|
-
|
|
750
|
+
*s = hsum_float_8(acc) + summs;
|
|
751
|
+
#else
|
|
752
|
+
UNUSED(nb);
|
|
753
|
+
UNUSED(x);
|
|
754
|
+
UNUSED(y);
|
|
755
|
+
UNUSED(ib);
|
|
756
|
+
wsp_ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
742
757
|
#endif
|
|
743
|
-
|
|
744
|
-
int sumi0 = 0;
|
|
745
|
-
int sumi1 = 0;
|
|
758
|
+
}
|
|
746
759
|
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
760
|
+
void wsp_ggml_vec_dot_mxfp4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
|
|
761
|
+
assert(nrc == 1);
|
|
762
|
+
UNUSED(nrc);
|
|
763
|
+
UNUSED(bx);
|
|
764
|
+
UNUSED(by);
|
|
765
|
+
UNUSED(bs);
|
|
766
|
+
assert(n % QK_MXFP4 == 0);
|
|
767
|
+
static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
|
|
750
768
|
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
}
|
|
769
|
+
const block_mxfp4 * WSP_GGML_RESTRICT x = vx;
|
|
770
|
+
const block_q8_0 * WSP_GGML_RESTRICT y = vy;
|
|
754
771
|
|
|
755
|
-
|
|
756
|
-
|
|
772
|
+
const int nb = n / QK_MXFP4;
|
|
773
|
+
|
|
774
|
+
int ib = 0;
|
|
775
|
+
float sumf = 0;
|
|
776
|
+
|
|
777
|
+
#if defined __AVX2__
|
|
778
|
+
|
|
779
|
+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
|
|
780
|
+
const __m128i m4b = _mm_set1_epi8(0x0f);
|
|
781
|
+
const __m256i mone = _mm256_set1_epi16(1);
|
|
782
|
+
|
|
783
|
+
__m256 accum1 = _mm256_setzero_ps();
|
|
784
|
+
__m256 accum2 = _mm256_setzero_ps();
|
|
785
|
+
for (; ib + 1 < nb; ib += 2) {
|
|
786
|
+
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
|
|
787
|
+
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
|
|
788
|
+
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
|
|
789
|
+
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
|
|
790
|
+
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
|
791
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
|
792
|
+
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
|
793
|
+
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
|
794
|
+
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
|
795
|
+
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
|
796
|
+
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
|
|
797
|
+
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
|
|
798
|
+
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*WSP_GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)),
|
|
799
|
+
_mm256_cvtepi32_ps(p_1), accum1);
|
|
800
|
+
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*WSP_GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)),
|
|
801
|
+
_mm256_cvtepi32_ps(p_2), accum2);
|
|
757
802
|
}
|
|
758
803
|
|
|
804
|
+
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
|
805
|
+
|
|
806
|
+
#elif defined __AVX__
|
|
807
|
+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
|
|
808
|
+
const __m128i m4b = _mm_set1_epi8(0x0f);
|
|
809
|
+
|
|
810
|
+
__m256 accum = _mm256_setzero_ps();
|
|
811
|
+
for (; ib + 1 < nb; ib += 2) {
|
|
812
|
+
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
|
|
813
|
+
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
|
814
|
+
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
|
|
815
|
+
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
|
|
816
|
+
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
|
|
817
|
+
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
|
|
818
|
+
|
|
819
|
+
const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
|
|
820
|
+
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
|
|
821
|
+
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
|
|
822
|
+
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
|
|
823
|
+
|
|
824
|
+
const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
|
|
825
|
+
const __m256 deltas = quad_mx_delta_float(x[ib].e, y[ib].d, x[ib + 1].e, y[ib + 1].d);
|
|
826
|
+
accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
|
|
827
|
+
}
|
|
828
|
+
|
|
829
|
+
sumf = hsum_float_8(accum);
|
|
830
|
+
|
|
831
|
+
#endif
|
|
832
|
+
for (; ib < nb; ++ib) {
|
|
833
|
+
const float d = WSP_GGML_CPU_FP16_TO_FP32(y[ib].d)*WSP_GGML_E8M0_TO_FP32_HALF(x[ib].e);
|
|
834
|
+
int sumi1 = 0;
|
|
835
|
+
int sumi2 = 0;
|
|
836
|
+
for (int j = 0; j < QK_MXFP4/2; ++j) {
|
|
837
|
+
sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
|
|
838
|
+
sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
|
|
839
|
+
}
|
|
840
|
+
sumf += d * (sumi1 + sumi2);
|
|
841
|
+
}
|
|
759
842
|
*s = sumf;
|
|
760
843
|
}
|
|
761
844
|
|
|
@@ -764,7 +847,6 @@ void wsp_ggml_vec_dot_q5_0_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
764
847
|
const int nb = n / qk;
|
|
765
848
|
|
|
766
849
|
int ib = 0;
|
|
767
|
-
float sumf = 0;
|
|
768
850
|
|
|
769
851
|
assert(n % qk == 0);
|
|
770
852
|
assert(qk == QK5_0);
|
|
@@ -799,7 +881,7 @@ void wsp_ggml_vec_dot_q5_0_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
799
881
|
acc = _mm256_fmadd_ps(d, q, acc);
|
|
800
882
|
}
|
|
801
883
|
|
|
802
|
-
|
|
884
|
+
*s = hsum_float_8(acc);
|
|
803
885
|
#elif defined(__AVX__)
|
|
804
886
|
// Initialize accumulator with zeros
|
|
805
887
|
__m256 acc = _mm256_setzero_ps();
|
|
@@ -830,32 +912,14 @@ void wsp_ggml_vec_dot_q5_0_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
830
912
|
acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
|
|
831
913
|
}
|
|
832
914
|
|
|
833
|
-
|
|
834
|
-
|
|
915
|
+
*s = hsum_float_8(acc);
|
|
916
|
+
#else
|
|
917
|
+
UNUSED(nb);
|
|
918
|
+
UNUSED(ib);
|
|
919
|
+
UNUSED(x);
|
|
920
|
+
UNUSED(y);
|
|
921
|
+
wsp_ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
835
922
|
#endif
|
|
836
|
-
for (; ib < nb; ++ib) {
|
|
837
|
-
uint32_t qh;
|
|
838
|
-
memcpy(&qh, x[ib].qh, sizeof(qh));
|
|
839
|
-
|
|
840
|
-
int sumi0 = 0;
|
|
841
|
-
int sumi1 = 0;
|
|
842
|
-
|
|
843
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
844
|
-
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
|
845
|
-
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
|
846
|
-
|
|
847
|
-
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
|
|
848
|
-
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
|
|
849
|
-
|
|
850
|
-
sumi0 += (x0 * y[ib].qs[j]);
|
|
851
|
-
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
|
852
|
-
}
|
|
853
|
-
|
|
854
|
-
int sumi = sumi0 + sumi1;
|
|
855
|
-
sumf += (WSP_GGML_CPU_FP16_TO_FP32(x[ib].d)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
|
856
|
-
}
|
|
857
|
-
|
|
858
|
-
*s = sumf;
|
|
859
923
|
}
|
|
860
924
|
|
|
861
925
|
void wsp_ggml_vec_dot_q5_1_q8_1(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -863,7 +927,6 @@ void wsp_ggml_vec_dot_q5_1_q8_1(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
863
927
|
const int nb = n / qk;
|
|
864
928
|
|
|
865
929
|
int ib = 0;
|
|
866
|
-
float sumf = 0;
|
|
867
930
|
|
|
868
931
|
assert(n % qk == 0);
|
|
869
932
|
assert(qk == QK5_1);
|
|
@@ -901,7 +964,7 @@ void wsp_ggml_vec_dot_q5_1_q8_1(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
901
964
|
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
|
902
965
|
}
|
|
903
966
|
|
|
904
|
-
|
|
967
|
+
*s = hsum_float_8(acc) + summs;
|
|
905
968
|
#elif defined(__AVX__)
|
|
906
969
|
// Initialize accumulator with zeros
|
|
907
970
|
__m256 acc = _mm256_setzero_ps();
|
|
@@ -935,32 +998,14 @@ void wsp_ggml_vec_dot_q5_1_q8_1(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
935
998
|
acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
|
|
936
999
|
}
|
|
937
1000
|
|
|
938
|
-
|
|
939
|
-
|
|
1001
|
+
*s = hsum_float_8(acc) + summs;
|
|
1002
|
+
#else
|
|
1003
|
+
UNUSED(nb);
|
|
1004
|
+
UNUSED(ib);
|
|
1005
|
+
UNUSED(x);
|
|
1006
|
+
UNUSED(y);
|
|
1007
|
+
wsp_ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
940
1008
|
#endif
|
|
941
|
-
for (; ib < nb; ++ib) {
|
|
942
|
-
uint32_t qh;
|
|
943
|
-
memcpy(&qh, x[ib].qh, sizeof(qh));
|
|
944
|
-
|
|
945
|
-
int sumi0 = 0;
|
|
946
|
-
int sumi1 = 0;
|
|
947
|
-
|
|
948
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
949
|
-
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
|
950
|
-
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
|
951
|
-
|
|
952
|
-
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
|
|
953
|
-
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
|
|
954
|
-
|
|
955
|
-
sumi0 += (x0 * y[ib].qs[j]);
|
|
956
|
-
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
|
957
|
-
}
|
|
958
|
-
|
|
959
|
-
int sumi = sumi0 + sumi1;
|
|
960
|
-
sumf += (WSP_GGML_CPU_FP16_TO_FP32(x[ib].d)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + WSP_GGML_CPU_FP16_TO_FP32(x[ib].m)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
961
|
-
}
|
|
962
|
-
|
|
963
|
-
*s = sumf;
|
|
964
1009
|
}
|
|
965
1010
|
|
|
966
1011
|
void wsp_ggml_vec_dot_q8_0_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -1017,7 +1062,6 @@ void wsp_ggml_vec_dot_q8_0_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
1017
1062
|
}
|
|
1018
1063
|
|
|
1019
1064
|
sumf = hsum_float_8(accum);
|
|
1020
|
-
|
|
1021
1065
|
#endif
|
|
1022
1066
|
for (; ib < nb; ++ib) {
|
|
1023
1067
|
int sumi = 0;
|
|
@@ -1157,44 +1201,10 @@ void wsp_ggml_vec_dot_tq1_0_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
1157
1201
|
*s = hsum_float_8(sumf);
|
|
1158
1202
|
|
|
1159
1203
|
#else
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
for (int i = 0; i < nb; ++i) {
|
|
1165
|
-
int sum = 0;
|
|
1166
|
-
|
|
1167
|
-
for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
|
|
1168
|
-
for (size_t l = 0; l < 5; ++l) {
|
|
1169
|
-
for (size_t m = 0; m < 32; ++m) {
|
|
1170
|
-
uint8_t q = x[i].qs[j + m] * pow3[l];
|
|
1171
|
-
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
|
1172
|
-
sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
|
|
1173
|
-
}
|
|
1174
|
-
}
|
|
1175
|
-
}
|
|
1176
|
-
for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
|
|
1177
|
-
for (size_t l = 0; l < 5; ++l) {
|
|
1178
|
-
for (size_t m = 0; m < 16; ++m) {
|
|
1179
|
-
uint8_t q = x[i].qs[j + m] * pow3[l];
|
|
1180
|
-
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
|
1181
|
-
sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
|
|
1182
|
-
}
|
|
1183
|
-
}
|
|
1184
|
-
}
|
|
1185
|
-
|
|
1186
|
-
for (size_t l = 0; l < 4; ++l) {
|
|
1187
|
-
for (size_t j = 0; j < sizeof(x->qh); ++j) {
|
|
1188
|
-
uint8_t q = x[i].qh[j] * pow3[l];
|
|
1189
|
-
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
|
1190
|
-
sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
|
|
1191
|
-
}
|
|
1192
|
-
}
|
|
1193
|
-
|
|
1194
|
-
sumf += (float) sum * (WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
|
|
1195
|
-
}
|
|
1196
|
-
|
|
1197
|
-
*s = sumf;
|
|
1204
|
+
UNUSED(x);
|
|
1205
|
+
UNUSED(y);
|
|
1206
|
+
UNUSED(nb);
|
|
1207
|
+
wsp_ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1198
1208
|
#endif
|
|
1199
1209
|
}
|
|
1200
1210
|
|
|
@@ -1257,25 +1267,10 @@ void wsp_ggml_vec_dot_tq2_0_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
1257
1267
|
*s = hsum_float_8(sumf);
|
|
1258
1268
|
|
|
1259
1269
|
#else
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
for (size_t j = 0; j < sizeof(x->qs); j += 32) {
|
|
1266
|
-
for (size_t l = 0; l < 4; ++l) {
|
|
1267
|
-
for (size_t k = 0; k < 32; ++k) {
|
|
1268
|
-
sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
|
|
1269
|
-
}
|
|
1270
|
-
}
|
|
1271
|
-
}
|
|
1272
|
-
|
|
1273
|
-
const float d = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1274
|
-
|
|
1275
|
-
sumf += (float) sumi * d;
|
|
1276
|
-
}
|
|
1277
|
-
|
|
1278
|
-
*s = sumf;
|
|
1270
|
+
UNUSED(x);
|
|
1271
|
+
UNUSED(y);
|
|
1272
|
+
UNUSED(nb);
|
|
1273
|
+
wsp_ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1279
1274
|
#endif
|
|
1280
1275
|
}
|
|
1281
1276
|
|
|
@@ -1464,45 +1459,10 @@ void wsp_ggml_vec_dot_q2_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
1464
1459
|
*s = hsum_float_8(acc);
|
|
1465
1460
|
|
|
1466
1461
|
#else
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
const uint8_t * q2 = x[i].qs;
|
|
1473
|
-
const int8_t * q8 = y[i].qs;
|
|
1474
|
-
const uint8_t * sc = x[i].scales;
|
|
1475
|
-
|
|
1476
|
-
int summs = 0;
|
|
1477
|
-
for (int j = 0; j < 16; ++j) {
|
|
1478
|
-
summs += y[i].bsums[j] * (sc[j] >> 4);
|
|
1479
|
-
}
|
|
1480
|
-
|
|
1481
|
-
const float dall = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1482
|
-
const float dmin = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1483
|
-
|
|
1484
|
-
int isum = 0;
|
|
1485
|
-
int is = 0;
|
|
1486
|
-
int d;
|
|
1487
|
-
for (int k = 0; k < QK_K/128; ++k) {
|
|
1488
|
-
int shift = 0;
|
|
1489
|
-
for (int j = 0; j < 4; ++j) {
|
|
1490
|
-
d = sc[is++] & 0xF;
|
|
1491
|
-
int isuml = 0;
|
|
1492
|
-
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
1493
|
-
isum += d * isuml;
|
|
1494
|
-
d = sc[is++] & 0xF;
|
|
1495
|
-
isuml = 0;
|
|
1496
|
-
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
1497
|
-
isum += d * isuml;
|
|
1498
|
-
shift += 2;
|
|
1499
|
-
q8 += 32;
|
|
1500
|
-
}
|
|
1501
|
-
q2 += 32;
|
|
1502
|
-
}
|
|
1503
|
-
sumf += dall * isum - dmin * summs;
|
|
1504
|
-
}
|
|
1505
|
-
*s = sumf;
|
|
1462
|
+
UNUSED(x);
|
|
1463
|
+
UNUSED(y);
|
|
1464
|
+
UNUSED(nb);
|
|
1465
|
+
wsp_ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1506
1466
|
#endif
|
|
1507
1467
|
}
|
|
1508
1468
|
|
|
@@ -1769,70 +1729,13 @@ void wsp_ggml_vec_dot_q3_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
1769
1729
|
*s = hsum_float_8(acc);
|
|
1770
1730
|
|
|
1771
1731
|
#else
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
|
|
1778
|
-
// write vectorized versions for AVX, ARM_NEON, etc.
|
|
1779
|
-
|
|
1780
|
-
int8_t aux8[QK_K];
|
|
1781
|
-
int16_t aux16[8];
|
|
1782
|
-
float sums [8];
|
|
1783
|
-
int32_t aux32[8];
|
|
1784
|
-
memset(sums, 0, 8*sizeof(float));
|
|
1785
|
-
|
|
1786
|
-
uint32_t auxs[4];
|
|
1787
|
-
const int8_t * scales = (const int8_t*)auxs;
|
|
1788
|
-
|
|
1789
|
-
float sumf = 0;
|
|
1790
|
-
for (int i = 0; i < nb; ++i) {
|
|
1791
|
-
const uint8_t * WSP_GGML_RESTRICT q3 = x[i].qs;
|
|
1792
|
-
const uint8_t * WSP_GGML_RESTRICT hm = x[i].hmask;
|
|
1793
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
1794
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
1795
|
-
int8_t * WSP_GGML_RESTRICT a = aux8;
|
|
1796
|
-
uint8_t m = 1;
|
|
1797
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
1798
|
-
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
|
1799
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
1800
|
-
a += 32; m <<= 1;
|
|
1801
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
|
1802
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
1803
|
-
a += 32; m <<= 1;
|
|
1804
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
|
1805
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
1806
|
-
a += 32; m <<= 1;
|
|
1807
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
|
1808
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
1809
|
-
a += 32; m <<= 1;
|
|
1810
|
-
q3 += 32;
|
|
1811
|
-
}
|
|
1812
|
-
a = aux8;
|
|
1813
|
-
|
|
1814
|
-
memcpy(auxs, x[i].scales, 12);
|
|
1815
|
-
uint32_t tmp = auxs[2];
|
|
1816
|
-
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
|
1817
|
-
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
|
1818
|
-
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
|
1819
|
-
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
|
1820
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
1821
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1822
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
1823
|
-
q8 += 8; a += 8;
|
|
1824
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1825
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
1826
|
-
q8 += 8; a += 8;
|
|
1827
|
-
}
|
|
1828
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1829
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1830
|
-
}
|
|
1831
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
1832
|
-
*s = sumf;
|
|
1833
|
-
|
|
1732
|
+
UNUSED(kmask1);
|
|
1733
|
+
UNUSED(kmask2);
|
|
1734
|
+
UNUSED(x);
|
|
1735
|
+
UNUSED(y);
|
|
1736
|
+
UNUSED(nb);
|
|
1737
|
+
wsp_ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1834
1738
|
#endif
|
|
1835
|
-
|
|
1836
1739
|
}
|
|
1837
1740
|
|
|
1838
1741
|
void wsp_ggml_vec_dot_q4_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -2002,61 +1905,14 @@ void wsp_ggml_vec_dot_q4_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
2002
1905
|
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
|
|
2003
1906
|
|
|
2004
1907
|
#else
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
memset(sums, 0, 8*sizeof(float));
|
|
2014
|
-
|
|
2015
|
-
float sumf = 0;
|
|
2016
|
-
for (int i = 0; i < nb; ++i) {
|
|
2017
|
-
const uint8_t * WSP_GGML_RESTRICT q4 = x[i].qs;
|
|
2018
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
2019
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
2020
|
-
int8_t * WSP_GGML_RESTRICT a = aux8;
|
|
2021
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
2022
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
2023
|
-
a += 32;
|
|
2024
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
2025
|
-
a += 32; q4 += 32;
|
|
2026
|
-
}
|
|
2027
|
-
memcpy(utmp, x[i].scales, 12);
|
|
2028
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
2029
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
2030
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
2031
|
-
utmp[2] = uaux;
|
|
2032
|
-
utmp[0] &= kmask1;
|
|
2033
|
-
|
|
2034
|
-
int sumi = 0;
|
|
2035
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
2036
|
-
a = aux8;
|
|
2037
|
-
int is = 0;
|
|
2038
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
2039
|
-
int32_t scale = scales[is++];
|
|
2040
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2041
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2042
|
-
q8 += 8; a += 8;
|
|
2043
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2044
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2045
|
-
q8 += 8; a += 8;
|
|
2046
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2047
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2048
|
-
q8 += 8; a += 8;
|
|
2049
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2050
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2051
|
-
q8 += 8; a += 8;
|
|
2052
|
-
}
|
|
2053
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2054
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2055
|
-
const float dmin = WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2056
|
-
sumf -= dmin * sumi;
|
|
2057
|
-
}
|
|
2058
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
2059
|
-
*s = sumf;
|
|
1908
|
+
UNUSED(x);
|
|
1909
|
+
UNUSED(y);
|
|
1910
|
+
UNUSED(nb);
|
|
1911
|
+
UNUSED(kmask1);
|
|
1912
|
+
UNUSED(kmask2);
|
|
1913
|
+
UNUSED(kmask3);
|
|
1914
|
+
UNUSED(utmp);
|
|
1915
|
+
wsp_ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2060
1916
|
#endif
|
|
2061
1917
|
}
|
|
2062
1918
|
|
|
@@ -2259,66 +2115,14 @@ void wsp_ggml_vec_dot_q5_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
2259
2115
|
*s = hsum_float_8(acc) + summs;
|
|
2260
2116
|
|
|
2261
2117
|
#else
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2266
|
-
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
memset(sums, 0, 8*sizeof(float));
|
|
2271
|
-
|
|
2272
|
-
float sumf = 0;
|
|
2273
|
-
for (int i = 0; i < nb; ++i) {
|
|
2274
|
-
const uint8_t * WSP_GGML_RESTRICT q4 = x[i].qs;
|
|
2275
|
-
const uint8_t * WSP_GGML_RESTRICT hm = x[i].qh;
|
|
2276
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
2277
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
2278
|
-
int8_t * WSP_GGML_RESTRICT a = aux8;
|
|
2279
|
-
uint8_t m = 1;
|
|
2280
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
2281
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
2282
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
2283
|
-
a += 32; m <<= 1;
|
|
2284
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
2285
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
2286
|
-
a += 32; m <<= 1;
|
|
2287
|
-
q4 += 32;
|
|
2288
|
-
}
|
|
2289
|
-
memcpy(utmp, x[i].scales, 12);
|
|
2290
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
2291
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
2292
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
2293
|
-
utmp[2] = uaux;
|
|
2294
|
-
utmp[0] &= kmask1;
|
|
2295
|
-
|
|
2296
|
-
int sumi = 0;
|
|
2297
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
2298
|
-
a = aux8;
|
|
2299
|
-
int is = 0;
|
|
2300
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
2301
|
-
int32_t scale = scales[is++];
|
|
2302
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2303
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2304
|
-
q8 += 8; a += 8;
|
|
2305
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2306
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2307
|
-
q8 += 8; a += 8;
|
|
2308
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2309
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2310
|
-
q8 += 8; a += 8;
|
|
2311
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2312
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2313
|
-
q8 += 8; a += 8;
|
|
2314
|
-
}
|
|
2315
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2316
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2317
|
-
const float dmin = WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2318
|
-
sumf -= dmin * sumi;
|
|
2319
|
-
}
|
|
2320
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
2321
|
-
*s = sumf;
|
|
2118
|
+
UNUSED(x);
|
|
2119
|
+
UNUSED(y);
|
|
2120
|
+
UNUSED(nb);
|
|
2121
|
+
UNUSED(kmask1);
|
|
2122
|
+
UNUSED(kmask2);
|
|
2123
|
+
UNUSED(kmask3);
|
|
2124
|
+
UNUSED(utmp);
|
|
2125
|
+
wsp_ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2322
2126
|
#endif
|
|
2323
2127
|
}
|
|
2324
2128
|
|
|
@@ -2520,47 +2324,10 @@ void wsp_ggml_vec_dot_q6_K_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, c
|
|
|
2520
2324
|
*s = hsum_float_8(acc);
|
|
2521
2325
|
|
|
2522
2326
|
#else
|
|
2523
|
-
|
|
2524
|
-
|
|
2525
|
-
|
|
2526
|
-
|
|
2527
|
-
int32_t aux32[8];
|
|
2528
|
-
memset(sums, 0, 8*sizeof(float));
|
|
2529
|
-
|
|
2530
|
-
float sumf = 0;
|
|
2531
|
-
for (int i = 0; i < nb; ++i) {
|
|
2532
|
-
const uint8_t * WSP_GGML_RESTRICT q4 = x[i].ql;
|
|
2533
|
-
const uint8_t * WSP_GGML_RESTRICT qh = x[i].qh;
|
|
2534
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
2535
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
2536
|
-
int8_t * WSP_GGML_RESTRICT a = aux8;
|
|
2537
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
2538
|
-
for (int l = 0; l < 32; ++l) {
|
|
2539
|
-
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
|
2540
|
-
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
|
2541
|
-
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
|
2542
|
-
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
|
2543
|
-
}
|
|
2544
|
-
a += 128;
|
|
2545
|
-
q4 += 64;
|
|
2546
|
-
qh += 32;
|
|
2547
|
-
}
|
|
2548
|
-
a = aux8;
|
|
2549
|
-
int is = 0;
|
|
2550
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
2551
|
-
int scale = x[i].scales[is++];
|
|
2552
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2553
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2554
|
-
q8 += 8; a += 8;
|
|
2555
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2556
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2557
|
-
q8 += 8; a += 8;
|
|
2558
|
-
}
|
|
2559
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2560
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2561
|
-
}
|
|
2562
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
2563
|
-
*s = sumf;
|
|
2327
|
+
UNUSED(x);
|
|
2328
|
+
UNUSED(y);
|
|
2329
|
+
UNUSED(nb);
|
|
2330
|
+
wsp_ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2564
2331
|
#endif
|
|
2565
2332
|
}
|
|
2566
2333
|
|
|
@@ -2712,34 +2479,10 @@ void wsp_ggml_vec_dot_iq2_xxs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs
|
|
|
2712
2479
|
*s = 0.125f * hsum_float_8(accumf);
|
|
2713
2480
|
|
|
2714
2481
|
#else
|
|
2715
|
-
|
|
2716
|
-
|
|
2717
|
-
|
|
2718
|
-
|
|
2719
|
-
float sumf = 0.f;
|
|
2720
|
-
for (int i = 0; i < nb; ++i) {
|
|
2721
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2722
|
-
const uint16_t * WSP_GGML_RESTRICT q2 = x[i].qs;
|
|
2723
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
2724
|
-
int32_t bsum = 0;
|
|
2725
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
2726
|
-
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
|
2727
|
-
q2 += 4;
|
|
2728
|
-
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
|
2729
|
-
int32_t sumi = 0;
|
|
2730
|
-
for (int l = 0; l < 4; ++l) {
|
|
2731
|
-
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
|
2732
|
-
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
|
2733
|
-
for (int j = 0; j < 8; ++j) {
|
|
2734
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
2735
|
-
}
|
|
2736
|
-
q8 += 8;
|
|
2737
|
-
}
|
|
2738
|
-
bsum += sumi * ls;
|
|
2739
|
-
}
|
|
2740
|
-
sumf += d * bsum;
|
|
2741
|
-
}
|
|
2742
|
-
*s = 0.125f * sumf;
|
|
2482
|
+
UNUSED(x);
|
|
2483
|
+
UNUSED(y);
|
|
2484
|
+
UNUSED(nb);
|
|
2485
|
+
wsp_ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2743
2486
|
#endif
|
|
2744
2487
|
}
|
|
2745
2488
|
|
|
@@ -3033,42 +2776,10 @@ void wsp_ggml_vec_dot_iq2_xs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
3033
2776
|
*s = 0.125f * hsum_float_8(accumf);
|
|
3034
2777
|
|
|
3035
2778
|
#else
|
|
3036
|
-
|
|
3037
|
-
|
|
3038
|
-
|
|
3039
|
-
|
|
3040
|
-
const uint16_t * WSP_GGML_RESTRICT q2 = x[i].qs;
|
|
3041
|
-
const uint8_t * WSP_GGML_RESTRICT sc = x[i].scales;
|
|
3042
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
3043
|
-
int32_t bsum = 0;
|
|
3044
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3045
|
-
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
|
|
3046
|
-
const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
|
|
3047
|
-
int32_t sumi = 0;
|
|
3048
|
-
for (int l = 0; l < 2; ++l) {
|
|
3049
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
3050
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
3051
|
-
for (int j = 0; j < 8; ++j) {
|
|
3052
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
3053
|
-
}
|
|
3054
|
-
q8 += 8;
|
|
3055
|
-
}
|
|
3056
|
-
bsum += sumi * ls1;
|
|
3057
|
-
sumi = 0;
|
|
3058
|
-
for (int l = 2; l < 4; ++l) {
|
|
3059
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
3060
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
3061
|
-
for (int j = 0; j < 8; ++j) {
|
|
3062
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
3063
|
-
}
|
|
3064
|
-
q8 += 8;
|
|
3065
|
-
}
|
|
3066
|
-
bsum += sumi * ls2;
|
|
3067
|
-
q2 += 4;
|
|
3068
|
-
}
|
|
3069
|
-
sumf += d * bsum;
|
|
3070
|
-
}
|
|
3071
|
-
*s = 0.125f * sumf;
|
|
2779
|
+
UNUSED(x);
|
|
2780
|
+
UNUSED(y);
|
|
2781
|
+
UNUSED(nb);
|
|
2782
|
+
wsp_ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3072
2783
|
#endif
|
|
3073
2784
|
}
|
|
3074
2785
|
|
|
@@ -3250,47 +2961,11 @@ void wsp_ggml_vec_dot_iq2_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
3250
2961
|
*s = 0.125f * hsum_float_8(accumf);
|
|
3251
2962
|
|
|
3252
2963
|
#else
|
|
3253
|
-
|
|
3254
|
-
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3258
|
-
const int8_t * q8 = y[i].qs;
|
|
3259
|
-
const uint8_t * qs = x[i].qs;
|
|
3260
|
-
const uint8_t * qh = x[i].qh;
|
|
3261
|
-
const uint8_t * signs = qs + QK_K/8;
|
|
3262
|
-
|
|
3263
|
-
int bsum = 0;
|
|
3264
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3265
|
-
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
|
3266
|
-
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
|
3267
|
-
int sumi1 = 0, sumi2 = 0;
|
|
3268
|
-
for (int l = 0; l < 2; ++l) {
|
|
3269
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
3270
|
-
for (int j = 0; j < 8; ++j) {
|
|
3271
|
-
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
3272
|
-
}
|
|
3273
|
-
q8 += 8;
|
|
3274
|
-
}
|
|
3275
|
-
for (int l = 2; l < 4; ++l) {
|
|
3276
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
3277
|
-
for (int j = 0; j < 8; ++j) {
|
|
3278
|
-
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
3279
|
-
}
|
|
3280
|
-
q8 += 8;
|
|
3281
|
-
}
|
|
3282
|
-
bsum += ls1 * sumi1 + ls2 * sumi2;
|
|
3283
|
-
qs += 4;
|
|
3284
|
-
signs += 4;
|
|
3285
|
-
}
|
|
3286
|
-
|
|
3287
|
-
sumf += d * bsum;
|
|
3288
|
-
}
|
|
3289
|
-
|
|
3290
|
-
*s = 0.125f * sumf;
|
|
3291
|
-
|
|
2964
|
+
UNUSED(x);
|
|
2965
|
+
UNUSED(y);
|
|
2966
|
+
UNUSED(nb);
|
|
2967
|
+
wsp_ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3292
2968
|
#endif
|
|
3293
|
-
|
|
3294
2969
|
}
|
|
3295
2970
|
|
|
3296
2971
|
void wsp_ggml_vec_dot_iq3_xxs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -3410,36 +3085,10 @@ void wsp_ggml_vec_dot_iq3_xxs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs
|
|
|
3410
3085
|
*s = 0.25f * hsum_float_8(accumf);
|
|
3411
3086
|
|
|
3412
3087
|
#else
|
|
3413
|
-
|
|
3414
|
-
|
|
3415
|
-
|
|
3416
|
-
|
|
3417
|
-
for (int i = 0; i < nb; ++i) {
|
|
3418
|
-
const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3419
|
-
const uint8_t * WSP_GGML_RESTRICT q3 = x[i].qs;
|
|
3420
|
-
const uint8_t * WSP_GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
3421
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
3422
|
-
int32_t bsum = 0;
|
|
3423
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3424
|
-
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
|
3425
|
-
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
|
3426
|
-
int32_t sumi = 0;
|
|
3427
|
-
for (int l = 0; l < 4; ++l) {
|
|
3428
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
|
3429
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
|
3430
|
-
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
|
3431
|
-
for (int j = 0; j < 4; ++j) {
|
|
3432
|
-
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
|
3433
|
-
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
|
3434
|
-
}
|
|
3435
|
-
q8 += 8;
|
|
3436
|
-
}
|
|
3437
|
-
q3 += 8;
|
|
3438
|
-
bsum += sumi * ls;
|
|
3439
|
-
}
|
|
3440
|
-
sumf += d * bsum;
|
|
3441
|
-
}
|
|
3442
|
-
*s = 0.25f * sumf;
|
|
3088
|
+
UNUSED(x);
|
|
3089
|
+
UNUSED(y);
|
|
3090
|
+
UNUSED(nb);
|
|
3091
|
+
wsp_ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3443
3092
|
#endif
|
|
3444
3093
|
}
|
|
3445
3094
|
|
|
@@ -3646,59 +3295,13 @@ void wsp_ggml_vec_dot_iq3_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
3646
3295
|
*s = hsum_float_8(accumf);
|
|
3647
3296
|
|
|
3648
3297
|
#else
|
|
3649
|
-
|
|
3650
|
-
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
const uint8_t * WSP_GGML_RESTRICT qs = x[i].qs;
|
|
3654
|
-
const uint8_t * WSP_GGML_RESTRICT qh = x[i].qh;
|
|
3655
|
-
const uint8_t * WSP_GGML_RESTRICT signs = x[i].signs;
|
|
3656
|
-
const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
|
|
3657
|
-
int32_t bsum = 0;
|
|
3658
|
-
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
3659
|
-
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
|
3660
|
-
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
|
3661
|
-
int32_t sumi = 0;
|
|
3662
|
-
for (int l = 0; l < 4; ++l) {
|
|
3663
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
|
3664
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
|
3665
|
-
for (int j = 0; j < 4; ++j) {
|
|
3666
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
3667
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
3668
|
-
}
|
|
3669
|
-
q8 += 8;
|
|
3670
|
-
}
|
|
3671
|
-
qs += 8;
|
|
3672
|
-
signs += 4;
|
|
3673
|
-
bsum += sumi * ls1;
|
|
3674
|
-
sumi = 0;
|
|
3675
|
-
for (int l = 0; l < 4; ++l) {
|
|
3676
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
|
3677
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
|
3678
|
-
for (int j = 0; j < 4; ++j) {
|
|
3679
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
3680
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
3681
|
-
}
|
|
3682
|
-
q8 += 8;
|
|
3683
|
-
}
|
|
3684
|
-
qs += 8;
|
|
3685
|
-
signs += 4;
|
|
3686
|
-
bsum += sumi * ls2;
|
|
3687
|
-
}
|
|
3688
|
-
sumf += d * bsum;
|
|
3689
|
-
}
|
|
3690
|
-
*s = sumf;
|
|
3298
|
+
UNUSED(x);
|
|
3299
|
+
UNUSED(y);
|
|
3300
|
+
UNUSED(nb);
|
|
3301
|
+
wsp_ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3691
3302
|
#endif
|
|
3692
3303
|
}
|
|
3693
3304
|
|
|
3694
|
-
#if defined(__AVX2__)
|
|
3695
|
-
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
|
3696
|
-
const __m256i ax = _mm256_sign_epi8(x, x);
|
|
3697
|
-
const __m256i sy = _mm256_sign_epi8(y, x);
|
|
3698
|
-
return _mm256_maddubs_epi16(ax, sy);
|
|
3699
|
-
}
|
|
3700
|
-
#endif
|
|
3701
|
-
|
|
3702
3305
|
void wsp_ggml_vec_dot_iq1_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, size_t bx, const void * WSP_GGML_RESTRICT vy, size_t by, int nrc) {
|
|
3703
3306
|
assert(n % QK_K == 0);
|
|
3704
3307
|
assert(nrc == 1);
|
|
@@ -3811,36 +3414,10 @@ void wsp_ggml_vec_dot_iq1_s_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
3811
3414
|
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
|
3812
3415
|
|
|
3813
3416
|
#else
|
|
3814
|
-
|
|
3815
|
-
|
|
3816
|
-
|
|
3817
|
-
|
|
3818
|
-
const int8_t * q8 = y[i].qs;
|
|
3819
|
-
const uint8_t * qs = x[i].qs;
|
|
3820
|
-
const uint16_t * qh = x[i].qh;
|
|
3821
|
-
|
|
3822
|
-
int sumi = 0, sumi1 = 0;
|
|
3823
|
-
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
3824
|
-
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
|
3825
|
-
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
|
3826
|
-
int lsum = 0;
|
|
3827
|
-
for (int l = 0; l < 4; ++l) {
|
|
3828
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
|
3829
|
-
for (int j = 0; j < 8; ++j) {
|
|
3830
|
-
lsum += q8[j] * grid[j];
|
|
3831
|
-
}
|
|
3832
|
-
q8 += 8;
|
|
3833
|
-
}
|
|
3834
|
-
sumi += ls * lsum;
|
|
3835
|
-
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
|
3836
|
-
qs += 4;
|
|
3837
|
-
}
|
|
3838
|
-
|
|
3839
|
-
sumf += WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
|
3840
|
-
}
|
|
3841
|
-
|
|
3842
|
-
*s = sumf;
|
|
3843
|
-
|
|
3417
|
+
UNUSED(x);
|
|
3418
|
+
UNUSED(y);
|
|
3419
|
+
UNUSED(nb);
|
|
3420
|
+
wsp_ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3844
3421
|
#endif
|
|
3845
3422
|
}
|
|
3846
3423
|
|
|
@@ -4043,52 +3620,11 @@ void wsp_ggml_vec_dot_iq1_m_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
4043
3620
|
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
|
4044
3621
|
|
|
4045
3622
|
#else
|
|
4046
|
-
|
|
4047
|
-
|
|
4048
|
-
|
|
4049
|
-
|
|
4050
|
-
|
|
4051
|
-
|
|
4052
|
-
const int8_t * q8 = y[i].qs;
|
|
4053
|
-
const uint8_t * qs = x[i].qs;
|
|
4054
|
-
const uint8_t * qh = x[i].qh;
|
|
4055
|
-
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
|
4056
|
-
|
|
4057
|
-
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
|
4058
|
-
|
|
4059
|
-
int sumi1 = 0, sumi2 = 0;
|
|
4060
|
-
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
4061
|
-
delta[0] = qh[0] & 0x08 ? -1 : 1;
|
|
4062
|
-
delta[1] = qh[0] & 0x80 ? -1 : 1;
|
|
4063
|
-
delta[2] = qh[1] & 0x08 ? -1 : 1;
|
|
4064
|
-
delta[3] = qh[1] & 0x80 ? -1 : 1;
|
|
4065
|
-
sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
|
|
4066
|
-
for (int l = 0; l < 4; ++l) {
|
|
4067
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
|
|
4068
|
-
int lsum1 = 0, lsum2 = 0;
|
|
4069
|
-
for (int j = 0; j < 8; ++j) {
|
|
4070
|
-
lsum1 += q8[j] * grid[j];
|
|
4071
|
-
lsum2 += q8[j];
|
|
4072
|
-
}
|
|
4073
|
-
q8 += 8;
|
|
4074
|
-
sum1[l/2] += lsum1;
|
|
4075
|
-
sum2[l/2] += lsum2*delta[l];
|
|
4076
|
-
}
|
|
4077
|
-
|
|
4078
|
-
const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
|
|
4079
|
-
const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
|
|
4080
|
-
|
|
4081
|
-
sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
|
|
4082
|
-
sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
|
|
4083
|
-
qs += 4;
|
|
4084
|
-
qh += 2;
|
|
4085
|
-
}
|
|
4086
|
-
|
|
4087
|
-
sumf += WSP_GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
|
4088
|
-
}
|
|
4089
|
-
|
|
4090
|
-
*s = sumf;
|
|
4091
|
-
|
|
3623
|
+
UNUSED(x);
|
|
3624
|
+
UNUSED(y);
|
|
3625
|
+
UNUSED(nb);
|
|
3626
|
+
UNUSED(scale);
|
|
3627
|
+
wsp_ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
4092
3628
|
#endif
|
|
4093
3629
|
}
|
|
4094
3630
|
|
|
@@ -4275,37 +3811,10 @@ void wsp_ggml_vec_dot_iq4_xs_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
|
|
|
4275
3811
|
*s = hsum_float_8(accum);
|
|
4276
3812
|
|
|
4277
3813
|
#else
|
|
4278
|
-
|
|
4279
|
-
|
|
4280
|
-
|
|
4281
|
-
|
|
4282
|
-
const uint8_t * qs = x[ibl].qs;
|
|
4283
|
-
const int8_t * q8 = y[ibl].qs;
|
|
4284
|
-
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
|
4285
|
-
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
|
4286
|
-
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
|
4287
|
-
h >>= 4;
|
|
4288
|
-
const float d1 = d4d8*(ls1 - 32);
|
|
4289
|
-
const float d2 = d4d8*(ls2 - 32);
|
|
4290
|
-
int sumi1 = 0, sumi2 = 0;
|
|
4291
|
-
for (int j = 0; j < 16; ++j) {
|
|
4292
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
4293
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
4294
|
-
}
|
|
4295
|
-
sumf += d1 * (sumi1 + sumi2);
|
|
4296
|
-
qs += 16;
|
|
4297
|
-
q8 += 32;
|
|
4298
|
-
sumi1 = sumi2 = 0;
|
|
4299
|
-
for (int j = 0; j < 16; ++j) {
|
|
4300
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
4301
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
4302
|
-
}
|
|
4303
|
-
sumf += d2 * (sumi1 + sumi2);
|
|
4304
|
-
qs += 16;
|
|
4305
|
-
q8 += 32;
|
|
4306
|
-
}
|
|
4307
|
-
}
|
|
4308
|
-
*s = sumf;
|
|
3814
|
+
UNUSED(x);
|
|
3815
|
+
UNUSED(y);
|
|
3816
|
+
UNUSED(nb);
|
|
3817
|
+
wsp_ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
4309
3818
|
#endif
|
|
4310
3819
|
}
|
|
4311
3820
|
|