@fugood/llama.node 1.0.0-beta.7 → 1.0.0
This diff shows the changes between publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +8 -0
- package/lib/index.ts +14 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +37 -0
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include "ggml-quants.h"
|
|
4
4
|
#include "ggml-impl.h"
|
|
5
5
|
#include "ggml-cpu.h"
|
|
6
|
+
#include "simd-mappings.h"
|
|
6
7
|
|
|
7
8
|
#include "../../quants.h"
|
|
8
9
|
#include "../../ggml-cpu-impl.h"
|
|
@@ -256,9 +257,9 @@ static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_
|
|
|
256
257
|
|
|
257
258
|
// quad fp16 delta calculation
|
|
258
259
|
static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
|
|
259
|
-
//
|
|
260
|
-
return _mm256_set_m128(_mm_set1_ps(
|
|
261
|
-
_mm_set1_ps(
|
|
260
|
+
// GGML_CPU_FP16_TO_FP32 is faster than Intel F16C
|
|
261
|
+
return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)),
|
|
262
|
+
_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
|
|
262
263
|
}
|
|
263
264
|
#endif
|
|
264
265
|
#elif defined(__SSSE3__)
|
|
@@ -305,7 +306,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
305
306
|
|
|
306
307
|
// Quantize these floats
|
|
307
308
|
const float d = maxScalar / 127.f;
|
|
308
|
-
y[i].d =
|
|
309
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
309
310
|
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
|
310
311
|
const __m256 mul = _mm256_set1_ps( id );
|
|
311
312
|
|
|
@@ -401,7 +402,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
401
402
|
|
|
402
403
|
// Quantize these floats
|
|
403
404
|
const float d = max_scalar / 127.f;
|
|
404
|
-
y[i].d =
|
|
405
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
405
406
|
const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
|
|
406
407
|
const __m256 mul = _mm256_set1_ps( id );
|
|
407
408
|
|
|
@@ -425,7 +426,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
425
426
|
|
|
426
427
|
#if defined(__AVX2__)
|
|
427
428
|
// Compute the sum of the quants and set y[i].s
|
|
428
|
-
y[i].s =
|
|
429
|
+
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
|
|
429
430
|
|
|
430
431
|
// Convert int32 to int16
|
|
431
432
|
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
|
@@ -455,7 +456,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
455
456
|
// Compute the sum of the quants and set y[i].s
|
|
456
457
|
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
|
|
457
458
|
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
|
|
458
|
-
y[i].s =
|
|
459
|
+
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
|
|
459
460
|
|
|
460
461
|
// Convert int32 to int16
|
|
461
462
|
ni0 = _mm_packs_epi32( ni0, ni1 );
|
|
@@ -552,7 +553,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
552
553
|
// Main loop
|
|
553
554
|
for (; ib < nb; ++ib) {
|
|
554
555
|
/* Compute combined scale for the block */
|
|
555
|
-
const __m256 d = _mm256_set1_ps(
|
|
556
|
+
const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
|
|
556
557
|
|
|
557
558
|
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
|
|
558
559
|
|
|
@@ -613,7 +614,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
613
614
|
_mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
|
|
614
615
|
|
|
615
616
|
// Compute combined scale for the block 0 and 1
|
|
616
|
-
const __m128 d_0_1 = _mm_set1_ps(
|
|
617
|
+
const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
|
|
617
618
|
|
|
618
619
|
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
|
|
619
620
|
|
|
@@ -631,7 +632,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
631
632
|
_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
|
|
632
633
|
|
|
633
634
|
// Compute combined scale for the block 2 and 3
|
|
634
|
-
const __m128 d_2_3 = _mm_set1_ps(
|
|
635
|
+
const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
|
|
635
636
|
|
|
636
637
|
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
|
637
638
|
|
|
@@ -680,7 +681,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
680
681
|
}
|
|
681
682
|
|
|
682
683
|
int sumi = sumi0 + sumi1;
|
|
683
|
-
sumf += sumi*
|
|
684
|
+
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
684
685
|
}
|
|
685
686
|
|
|
686
687
|
*s = sumf;
|
|
@@ -711,10 +712,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
711
712
|
|
|
712
713
|
// Main loop
|
|
713
714
|
for (; ib < nb; ++ib) {
|
|
714
|
-
const float d0 =
|
|
715
|
-
const float d1 =
|
|
715
|
+
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
|
|
716
|
+
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
716
717
|
|
|
717
|
-
summs +=
|
|
718
|
+
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
718
719
|
|
|
719
720
|
const __m256 d0v = _mm256_set1_ps( d0 );
|
|
720
721
|
const __m256 d1v = _mm256_set1_ps( d1 );
|
|
@@ -752,7 +753,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
752
753
|
}
|
|
753
754
|
|
|
754
755
|
int sumi = sumi0 + sumi1;
|
|
755
|
-
sumf += (
|
|
756
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
756
757
|
}
|
|
757
758
|
|
|
758
759
|
*s = sumf;
|
|
@@ -783,7 +784,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
783
784
|
// Main loop
|
|
784
785
|
for (; ib < nb; ++ib) {
|
|
785
786
|
/* Compute combined scale for the block */
|
|
786
|
-
const __m256 d = _mm256_set1_ps(
|
|
787
|
+
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
787
788
|
|
|
788
789
|
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
|
|
789
790
|
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
|
@@ -807,7 +808,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
807
808
|
// Main loop
|
|
808
809
|
for (; ib < nb; ++ib) {
|
|
809
810
|
/* Compute combined scale for the block */
|
|
810
|
-
const __m256 d = _mm256_set1_ps(
|
|
811
|
+
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
811
812
|
|
|
812
813
|
__m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
|
|
813
814
|
const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
|
@@ -851,7 +852,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
851
852
|
}
|
|
852
853
|
|
|
853
854
|
int sumi = sumi0 + sumi1;
|
|
854
|
-
sumf += (
|
|
855
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
|
855
856
|
}
|
|
856
857
|
|
|
857
858
|
*s = sumf;
|
|
@@ -883,16 +884,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
883
884
|
|
|
884
885
|
// Main loop
|
|
885
886
|
for (; ib < nb; ++ib) {
|
|
886
|
-
const __m256 dx = _mm256_set1_ps(
|
|
887
|
+
const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
|
|
887
888
|
|
|
888
|
-
summs +=
|
|
889
|
+
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
889
890
|
|
|
890
891
|
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
|
|
891
892
|
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
|
892
893
|
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
|
893
894
|
qx = _mm256_or_si256(qx, bxhi);
|
|
894
895
|
|
|
895
|
-
const __m256 dy = _mm256_set1_ps(
|
|
896
|
+
const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
896
897
|
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
|
|
897
898
|
|
|
898
899
|
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
|
@@ -910,9 +911,9 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
910
911
|
|
|
911
912
|
// Main loop
|
|
912
913
|
for (; ib < nb; ++ib) {
|
|
913
|
-
const __m256 dx = _mm256_set1_ps(
|
|
914
|
+
const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
|
|
914
915
|
|
|
915
|
-
summs +=
|
|
916
|
+
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
916
917
|
|
|
917
918
|
__m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
|
|
918
919
|
const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
|
@@ -926,7 +927,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
926
927
|
bxh = _mm_or_si128(bxh, bxhih);
|
|
927
928
|
bx_0 = MM256_SET_M128I(bxh, bxl);
|
|
928
929
|
|
|
929
|
-
const __m256 dy = _mm256_set1_ps(
|
|
930
|
+
const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
930
931
|
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
|
|
931
932
|
|
|
932
933
|
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
|
|
@@ -956,7 +957,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
956
957
|
}
|
|
957
958
|
|
|
958
959
|
int sumi = sumi0 + sumi1;
|
|
959
|
-
sumf += (
|
|
960
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
960
961
|
}
|
|
961
962
|
|
|
962
963
|
*s = sumf;
|
|
@@ -986,7 +987,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
986
987
|
// Main loop
|
|
987
988
|
for (; ib < nb; ++ib) {
|
|
988
989
|
// Compute combined scale for the block
|
|
989
|
-
const __m256 d = _mm256_set1_ps(
|
|
990
|
+
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
990
991
|
__m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs);
|
|
991
992
|
__m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
|
|
992
993
|
|
|
@@ -1025,7 +1026,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1025
1026
|
sumi += x[ib].qs[j]*y[ib].qs[j];
|
|
1026
1027
|
}
|
|
1027
1028
|
|
|
1028
|
-
sumf += sumi*(
|
|
1029
|
+
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
1029
1030
|
}
|
|
1030
1031
|
|
|
1031
1032
|
*s = sumf;
|
|
@@ -1144,7 +1145,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1144
1145
|
}
|
|
1145
1146
|
|
|
1146
1147
|
const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
|
|
1147
|
-
const __m256 d = _mm256_set1_ps(y[i].d *
|
|
1148
|
+
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
|
|
1148
1149
|
|
|
1149
1150
|
sumi0 = _mm256_sub_epi16(sumi0, ysum);
|
|
1150
1151
|
sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
|
|
@@ -1190,7 +1191,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1190
1191
|
}
|
|
1191
1192
|
}
|
|
1192
1193
|
|
|
1193
|
-
sumf += (float) sum * (
|
|
1194
|
+
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
|
|
1194
1195
|
}
|
|
1195
1196
|
|
|
1196
1197
|
*s = sumf;
|
|
@@ -1244,7 +1245,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1244
1245
|
}
|
|
1245
1246
|
|
|
1246
1247
|
const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
|
|
1247
|
-
const __m256 d = _mm256_set1_ps(y[i].d *
|
|
1248
|
+
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
|
|
1248
1249
|
|
|
1249
1250
|
sumi0 = _mm256_add_epi16(sumi0, sumi1);
|
|
1250
1251
|
sumi0 = _mm256_sub_epi16(sumi0, ysum);
|
|
@@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1269
1270
|
}
|
|
1270
1271
|
}
|
|
1271
1272
|
|
|
1272
|
-
const float d = y[i].d *
|
|
1273
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1273
1274
|
|
|
1274
1275
|
sumf += (float) sumi * d;
|
|
1275
1276
|
}
|
|
@@ -1299,8 +1300,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1299
1300
|
|
|
1300
1301
|
for (int i = 0; i < nb; ++i) {
|
|
1301
1302
|
|
|
1302
|
-
const float d = y[i].d *
|
|
1303
|
-
const float dmin = -y[i].d *
|
|
1303
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1304
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1304
1305
|
|
|
1305
1306
|
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1306
1307
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -1366,8 +1367,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1366
1367
|
|
|
1367
1368
|
for (int i = 0; i < nb; ++i) {
|
|
1368
1369
|
|
|
1369
|
-
const float dall = y[i].d *
|
|
1370
|
-
const float dmin = -y[i].d *
|
|
1370
|
+
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1371
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1371
1372
|
|
|
1372
1373
|
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1373
1374
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -1477,8 +1478,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1477
1478
|
summs += y[i].bsums[j] * (sc[j] >> 4);
|
|
1478
1479
|
}
|
|
1479
1480
|
|
|
1480
|
-
const float dall = y[i].d *
|
|
1481
|
-
const float dmin = y[i].d *
|
|
1481
|
+
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1482
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1482
1483
|
|
|
1483
1484
|
int isum = 0;
|
|
1484
1485
|
int is = 0;
|
|
@@ -1533,7 +1534,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1533
1534
|
|
|
1534
1535
|
for (int i = 0; i < nb; ++i) {
|
|
1535
1536
|
|
|
1536
|
-
const float d = y[i].d *
|
|
1537
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1537
1538
|
|
|
1538
1539
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
1539
1540
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -1638,7 +1639,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1638
1639
|
|
|
1639
1640
|
for (int i = 0; i < nb; ++i) {
|
|
1640
1641
|
|
|
1641
|
-
const float d = y[i].d *
|
|
1642
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1642
1643
|
|
|
1643
1644
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
1644
1645
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -1824,7 +1825,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1824
1825
|
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
1825
1826
|
q8 += 8; a += 8;
|
|
1826
1827
|
}
|
|
1827
|
-
const float d =
|
|
1828
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1828
1829
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1829
1830
|
}
|
|
1830
1831
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -1862,8 +1863,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1862
1863
|
|
|
1863
1864
|
for (int i = 0; i < nb; ++i) {
|
|
1864
1865
|
|
|
1865
|
-
const float d = y[i].d *
|
|
1866
|
-
const float dmin = -y[i].d *
|
|
1866
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1867
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1867
1868
|
|
|
1868
1869
|
memcpy(utmp, x[i].scales, 12);
|
|
1869
1870
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
@@ -1928,8 +1929,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1928
1929
|
|
|
1929
1930
|
for (int i = 0; i < nb; ++i) {
|
|
1930
1931
|
|
|
1931
|
-
const float d = y[i].d *
|
|
1932
|
-
const float dmin = -y[i].d *
|
|
1932
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1933
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1933
1934
|
|
|
1934
1935
|
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
1935
1936
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -2049,9 +2050,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2049
2050
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2050
2051
|
q8 += 8; a += 8;
|
|
2051
2052
|
}
|
|
2052
|
-
const float d =
|
|
2053
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2053
2054
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2054
|
-
const float dmin =
|
|
2055
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2055
2056
|
sumf -= dmin * sumi;
|
|
2056
2057
|
}
|
|
2057
2058
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -2092,8 +2093,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2092
2093
|
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
2093
2094
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2094
2095
|
|
|
2095
|
-
const float d = y[i].d *
|
|
2096
|
-
const float dmin = -y[i].d *
|
|
2096
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
2097
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
2097
2098
|
|
|
2098
2099
|
memcpy(utmp, x[i].scales, 12);
|
|
2099
2100
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
@@ -2170,8 +2171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2170
2171
|
|
|
2171
2172
|
for (int i = 0; i < nb; ++i) {
|
|
2172
2173
|
|
|
2173
|
-
const float d = y[i].d *
|
|
2174
|
-
const float dmin = -y[i].d *
|
|
2174
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
2175
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
2175
2176
|
|
|
2176
2177
|
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
2177
2178
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -2311,9 +2312,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2311
2312
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2312
2313
|
q8 += 8; a += 8;
|
|
2313
2314
|
}
|
|
2314
|
-
const float d =
|
|
2315
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2315
2316
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2316
|
-
const float dmin =
|
|
2317
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2317
2318
|
sumf -= dmin * sumi;
|
|
2318
2319
|
}
|
|
2319
2320
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -2344,7 +2345,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2344
2345
|
|
|
2345
2346
|
for (int i = 0; i < nb; ++i) {
|
|
2346
2347
|
|
|
2347
|
-
const float d = y[i].d *
|
|
2348
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
2348
2349
|
|
|
2349
2350
|
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
2350
2351
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
@@ -2422,7 +2423,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2422
2423
|
|
|
2423
2424
|
for (int i = 0; i < nb; ++i) {
|
|
2424
2425
|
|
|
2425
|
-
const float d = y[i].d *
|
|
2426
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
2426
2427
|
|
|
2427
2428
|
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
2428
2429
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
@@ -2555,7 +2556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2555
2556
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2556
2557
|
q8 += 8; a += 8;
|
|
2557
2558
|
}
|
|
2558
|
-
const float d =
|
|
2559
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2559
2560
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2560
2561
|
}
|
|
2561
2562
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -2622,7 +2623,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
2622
2623
|
|
|
2623
2624
|
__m256 accumf = _mm256_setzero_ps();
|
|
2624
2625
|
for (int i = 0; i < nb; ++i) {
|
|
2625
|
-
const float d =
|
|
2626
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2626
2627
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
2627
2628
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2628
2629
|
__m256i sumi1 = _mm256_setzero_si256();
|
|
@@ -2663,7 +2664,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
2663
2664
|
|
|
2664
2665
|
__m256 accumf = _mm256_setzero_ps();
|
|
2665
2666
|
for (int i = 0; i < nb; ++i) {
|
|
2666
|
-
const float d =
|
|
2667
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2667
2668
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
2668
2669
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2669
2670
|
__m128i sumi1_0 = _mm_setzero_si128();
|
|
@@ -2717,7 +2718,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
2717
2718
|
|
|
2718
2719
|
float sumf = 0.f;
|
|
2719
2720
|
for (int i = 0; i < nb; ++i) {
|
|
2720
|
-
const float d =
|
|
2721
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2721
2722
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
2722
2723
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2723
2724
|
int32_t bsum = 0;
|
|
@@ -2792,7 +2793,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
2792
2793
|
|
|
2793
2794
|
__m256 accumf = _mm256_setzero_ps();
|
|
2794
2795
|
for (int i = 0; i < nb; ++i) {
|
|
2795
|
-
const float d =
|
|
2796
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2796
2797
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
2797
2798
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2798
2799
|
|
|
@@ -2913,7 +2914,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
2913
2914
|
|
|
2914
2915
|
__m256 accumf = _mm256_setzero_ps();
|
|
2915
2916
|
for (int i = 0; i < nb; ++i) {
|
|
2916
|
-
const float d =
|
|
2917
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2917
2918
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
2918
2919
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2919
2920
|
|
|
@@ -3035,7 +3036,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
3035
3036
|
|
|
3036
3037
|
float sumf = 0.f;
|
|
3037
3038
|
for (int i = 0; i < nb; ++i) {
|
|
3038
|
-
const float d =
|
|
3039
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3039
3040
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
3040
3041
|
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
|
3041
3042
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -3104,7 +3105,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3104
3105
|
|
|
3105
3106
|
__m256 accumf = _mm256_setzero_ps();
|
|
3106
3107
|
for (int i = 0; i < nb; ++i) {
|
|
3107
|
-
const float d =
|
|
3108
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3108
3109
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
3109
3110
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
3110
3111
|
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
|
@@ -3177,7 +3178,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3177
3178
|
|
|
3178
3179
|
__m256 accumf = _mm256_setzero_ps();
|
|
3179
3180
|
for (int i = 0; i < nb; ++i) {
|
|
3180
|
-
const float d =
|
|
3181
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3181
3182
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
3182
3183
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
3183
3184
|
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
|
@@ -3253,7 +3254,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3253
3254
|
float sumf = 0;
|
|
3254
3255
|
for (int i = 0; i < nb; i++) {
|
|
3255
3256
|
|
|
3256
|
-
const float d =
|
|
3257
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3257
3258
|
const int8_t * q8 = y[i].qs;
|
|
3258
3259
|
const uint8_t * qs = x[i].qs;
|
|
3259
3260
|
const uint8_t * qh = x[i].qh;
|
|
@@ -3313,7 +3314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
3313
3314
|
|
|
3314
3315
|
__m256 accumf = _mm256_setzero_ps();
|
|
3315
3316
|
for (int i = 0; i < nb; ++i) {
|
|
3316
|
-
const float d =
|
|
3317
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3317
3318
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
3318
3319
|
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
3319
3320
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -3358,7 +3359,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
3358
3359
|
|
|
3359
3360
|
__m256 accumf = _mm256_setzero_ps();
|
|
3360
3361
|
for (int i = 0; i < nb; ++i) {
|
|
3361
|
-
const float d =
|
|
3362
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3362
3363
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
3363
3364
|
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
3364
3365
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -3414,7 +3415,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
3414
3415
|
|
|
3415
3416
|
float sumf = 0.f;
|
|
3416
3417
|
for (int i = 0; i < nb; ++i) {
|
|
3417
|
-
const float d =
|
|
3418
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3418
3419
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
3419
3420
|
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
3420
3421
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -3480,7 +3481,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3480
3481
|
|
|
3481
3482
|
__m256 accumf = _mm256_setzero_ps();
|
|
3482
3483
|
for (int i = 0; i < nb; ++i) {
|
|
3483
|
-
const float d =
|
|
3484
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3484
3485
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
3485
3486
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
3486
3487
|
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
|
|
@@ -3565,7 +3566,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3565
3566
|
|
|
3566
3567
|
__m256 accumf = _mm256_setzero_ps();
|
|
3567
3568
|
for (int i = 0; i < nb; ++i) {
|
|
3568
|
-
const float d =
|
|
3569
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3569
3570
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
3570
3571
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
3571
3572
|
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
|
|
@@ -3648,7 +3649,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3648
3649
|
|
|
3649
3650
|
float sumf = 0.f;
|
|
3650
3651
|
for (int i = 0; i < nb; ++i) {
|
|
3651
|
-
const float d =
|
|
3652
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3652
3653
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
3653
3654
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
3654
3655
|
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
|
@@ -3753,7 +3754,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3753
3754
|
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
|
|
3754
3755
|
}
|
|
3755
3756
|
|
|
3756
|
-
const float d = y[i].d *
|
|
3757
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
3757
3758
|
accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
|
|
3758
3759
|
accum1 += d * sumi1;
|
|
3759
3760
|
|
|
@@ -3801,7 +3802,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3801
3802
|
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
|
|
3802
3803
|
}
|
|
3803
3804
|
|
|
3804
|
-
const float d = y[i].d *
|
|
3805
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
3805
3806
|
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
|
|
3806
3807
|
accum1 += d * sumi1;
|
|
3807
3808
|
|
|
@@ -3835,7 +3836,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3835
3836
|
qs += 4;
|
|
3836
3837
|
}
|
|
3837
3838
|
|
|
3838
|
-
sumf +=
|
|
3839
|
+
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
|
3839
3840
|
}
|
|
3840
3841
|
|
|
3841
3842
|
*s = sumf;
|
|
@@ -3947,7 +3948,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3947
3948
|
qs += 8; qh += 4;
|
|
3948
3949
|
}
|
|
3949
3950
|
|
|
3950
|
-
const __m256 d = _mm256_set1_ps(y[i].d *
|
|
3951
|
+
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
|
|
3951
3952
|
|
|
3952
3953
|
accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
|
|
3953
3954
|
accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
|
|
@@ -4033,7 +4034,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
4033
4034
|
qs += 8; qh += 4;
|
|
4034
4035
|
}
|
|
4035
4036
|
|
|
4036
|
-
const __m256 d = _mm256_set1_ps(y[i].d *
|
|
4037
|
+
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
|
|
4037
4038
|
|
|
4038
4039
|
accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
|
|
4039
4040
|
accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
|
|
@@ -4083,7 +4084,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
4083
4084
|
qh += 2;
|
|
4084
4085
|
}
|
|
4085
4086
|
|
|
4086
|
-
sumf +=
|
|
4087
|
+
sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
|
4087
4088
|
}
|
|
4088
4089
|
|
|
4089
4090
|
*s = sumf;
|
|
@@ -4129,9 +4130,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
4129
4130
|
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
|
4130
4131
|
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
|
|
4131
4132
|
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
|
|
4132
|
-
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(
|
|
4133
|
+
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
|
|
4133
4134
|
_mm256_cvtepi32_ps(p_1), accum1);
|
|
4134
|
-
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(
|
|
4135
|
+
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
|
|
4135
4136
|
_mm256_cvtepi32_ps(p_2), accum2);
|
|
4136
4137
|
}
|
|
4137
4138
|
|
|
@@ -4164,7 +4165,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
4164
4165
|
|
|
4165
4166
|
#endif
|
|
4166
4167
|
for (; ib < nb; ++ib) {
|
|
4167
|
-
const float d =
|
|
4168
|
+
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
|
|
4168
4169
|
int sumi1 = 0, sumi2 = 0;
|
|
4169
4170
|
for (int j = 0; j < QK4_NL/2; ++j) {
|
|
4170
4171
|
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
|
@@ -4219,7 +4220,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
4219
4220
|
sumi1 = _mm256_add_epi32(p_1, sumi1);
|
|
4220
4221
|
sumi2 = _mm256_add_epi32(p_2, sumi2);
|
|
4221
4222
|
}
|
|
4222
|
-
accum = _mm256_fmadd_ps(_mm256_set1_ps(
|
|
4223
|
+
accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
|
|
4223
4224
|
_mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
|
|
4224
4225
|
}
|
|
4225
4226
|
|
|
@@ -4267,7 +4268,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
4267
4268
|
}
|
|
4268
4269
|
__m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
|
|
4269
4270
|
__m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
|
|
4270
|
-
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(
|
|
4271
|
+
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
|
|
4271
4272
|
_mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
|
|
4272
4273
|
}
|
|
4273
4274
|
|
|
@@ -4276,7 +4277,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
4276
4277
|
#else
|
|
4277
4278
|
float sumf = 0;
|
|
4278
4279
|
for (int ibl = 0; ibl < nb; ++ibl) {
|
|
4279
|
-
const float d4d8 =
|
|
4280
|
+
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
|
4280
4281
|
uint16_t h = x[ibl].scales_h;
|
|
4281
4282
|
const uint8_t * qs = x[ibl].qs;
|
|
4282
4283
|
const int8_t * q8 = y[ibl].qs;
|