@fugood/llama.node 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +8 -0
- package/lib/index.ts +14 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +37 -0
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
#include "ggml-impl.h"
|
|
7
7
|
#include "ggml-cpu.h"
|
|
8
8
|
#include "ggml-cpu-impl.h"
|
|
9
|
+
#include "simd-mappings.h"
|
|
9
10
|
#include "traits.h"
|
|
10
11
|
|
|
11
12
|
#include <cmath>
|
|
@@ -51,7 +52,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|
|
51
52
|
const float d = amax / ((1 << 7) - 1);
|
|
52
53
|
id[row_iter] = d ? 1.0f / d : 0.0f;
|
|
53
54
|
|
|
54
|
-
y[i].d[row_iter] =
|
|
55
|
+
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
|
55
56
|
}
|
|
56
57
|
|
|
57
58
|
for (int j = 0; j < 8; j++) {
|
|
@@ -102,7 +103,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|
|
102
103
|
const float d = amax / ((1 << 7) - 1);
|
|
103
104
|
id[row_iter] = d ? 1.0f / d : 0.0f;
|
|
104
105
|
|
|
105
|
-
y[i].d[row_iter] =
|
|
106
|
+
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
|
106
107
|
}
|
|
107
108
|
|
|
108
109
|
for (int j = 0; j < QK8_0 * 4; j++) {
|
|
@@ -145,7 +146,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|
|
145
146
|
const float d = amax / ((1 << 7) - 1);
|
|
146
147
|
id[row_iter] = d ? 1.0f / d : 0.0f;
|
|
147
148
|
|
|
148
|
-
y[i].d[row_iter] =
|
|
149
|
+
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
|
149
150
|
}
|
|
150
151
|
|
|
151
152
|
for (int j = 0; j < 4; j++) {
|
|
@@ -221,7 +222,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|
|
221
222
|
const float d = amax / ((1 << 7) - 1);
|
|
222
223
|
id[row_iter] = d ? 1.0f / d : 0.0f;
|
|
223
224
|
|
|
224
|
-
y[i].d[row_iter] =
|
|
225
|
+
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
|
225
226
|
}
|
|
226
227
|
|
|
227
228
|
for (int j = 0; j < QK8_0 * 4; j++) {
|
|
@@ -311,7 +312,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
311
312
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
312
313
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
313
314
|
}
|
|
314
|
-
sumf[j] += sumi *
|
|
315
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
315
316
|
}
|
|
316
317
|
}
|
|
317
318
|
}
|
|
@@ -399,7 +400,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
399
400
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
400
401
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
401
402
|
}
|
|
402
|
-
sumf[j] += sumi *
|
|
403
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
403
404
|
}
|
|
404
405
|
}
|
|
405
406
|
}
|
|
@@ -514,7 +515,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
514
515
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
515
516
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
516
517
|
}
|
|
517
|
-
sumf[j] += sumi *
|
|
518
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
518
519
|
}
|
|
519
520
|
}
|
|
520
521
|
}
|
|
@@ -608,7 +609,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
608
609
|
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
609
610
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
610
611
|
}
|
|
611
|
-
sumf[j] += sumi *
|
|
612
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
612
613
|
}
|
|
613
614
|
}
|
|
614
615
|
}
|
|
@@ -1117,7 +1118,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1117
1118
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
1118
1119
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
1119
1120
|
}
|
|
1120
|
-
sumf[m][j] += sumi *
|
|
1121
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1121
1122
|
}
|
|
1122
1123
|
}
|
|
1123
1124
|
}
|
|
@@ -1570,7 +1571,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1570
1571
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
1571
1572
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
1572
1573
|
}
|
|
1573
|
-
sumf[m][j] += sumi *
|
|
1574
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1574
1575
|
}
|
|
1575
1576
|
}
|
|
1576
1577
|
}
|
|
@@ -2039,7 +2040,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2039
2040
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
2040
2041
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
2041
2042
|
}
|
|
2042
|
-
sumf[m][j] += sumi *
|
|
2043
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
2043
2044
|
}
|
|
2044
2045
|
}
|
|
2045
2046
|
}
|
|
@@ -2147,7 +2148,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
2147
2148
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
2148
2149
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
2149
2150
|
}
|
|
2150
|
-
sumf[m][j] += sumi *
|
|
2151
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
2151
2152
|
}
|
|
2152
2153
|
}
|
|
2153
2154
|
}
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include "ggml-quants.h"
|
|
4
4
|
#include "ggml-impl.h"
|
|
5
5
|
#include "ggml-cpu.h"
|
|
6
|
+
#include "simd-mappings.h"
|
|
6
7
|
|
|
7
8
|
#include "../../quants.h"
|
|
8
9
|
#include "../../ggml-cpu-impl.h"
|
|
@@ -474,7 +475,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
474
475
|
|
|
475
476
|
// Quantize these floats
|
|
476
477
|
const float d = max_scalar / 127.f;
|
|
477
|
-
y[i].d =
|
|
478
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
478
479
|
const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
|
|
479
480
|
const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );
|
|
480
481
|
|
|
@@ -548,7 +549,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
548
549
|
|
|
549
550
|
// Quantize these floats
|
|
550
551
|
const float d = max_scalar / 127.f;
|
|
551
|
-
y[i].d =
|
|
552
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
552
553
|
const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
|
|
553
554
|
const __m256 mul = __lasx_xvreplfr2vr_s( id );
|
|
554
555
|
|
|
@@ -576,7 +577,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
576
577
|
// Compute the sum of the quants and set y[i].s
|
|
577
578
|
const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3));
|
|
578
579
|
const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7));
|
|
579
|
-
y[i].s =
|
|
580
|
+
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
|
|
580
581
|
|
|
581
582
|
// Convert int32 to int16
|
|
582
583
|
ni0 = lsx_packs_w( ni0, ni1 );
|
|
@@ -667,7 +668,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
667
668
|
// Main loop
|
|
668
669
|
for (; ib < nb; ++ib) {
|
|
669
670
|
/* Compute combined scale for the block */
|
|
670
|
-
const __m256 d = __lasx_xvreplfr2vr_s(
|
|
671
|
+
const __m256 d = __lasx_xvreplfr2vr_s( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
|
|
671
672
|
|
|
672
673
|
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
|
|
673
674
|
|
|
@@ -699,7 +700,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
699
700
|
for (; ib + 1 < nb; ib += 2) {
|
|
700
701
|
|
|
701
702
|
// Compute combined scale for the block 0 and 1
|
|
702
|
-
const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w(
|
|
703
|
+
const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
|
|
703
704
|
|
|
704
705
|
const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
|
|
705
706
|
|
|
@@ -717,7 +718,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
717
718
|
//_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
|
|
718
719
|
|
|
719
720
|
// Compute combined scale for the block 2 and 3
|
|
720
|
-
const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w(
|
|
721
|
+
const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
|
|
721
722
|
|
|
722
723
|
const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
|
|
723
724
|
|
|
@@ -766,7 +767,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
766
767
|
}
|
|
767
768
|
|
|
768
769
|
int sumi = sumi0 + sumi1;
|
|
769
|
-
sumf += sumi*
|
|
770
|
+
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
770
771
|
}
|
|
771
772
|
|
|
772
773
|
*s = sumf;
|
|
@@ -797,10 +798,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
797
798
|
|
|
798
799
|
// Main loop
|
|
799
800
|
for (; ib < nb; ++ib) {
|
|
800
|
-
const float d0 =
|
|
801
|
-
const float d1 =
|
|
801
|
+
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
|
|
802
|
+
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
802
803
|
|
|
803
|
-
summs +=
|
|
804
|
+
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
804
805
|
|
|
805
806
|
const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
|
|
806
807
|
const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
|
|
@@ -834,7 +835,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
834
835
|
}
|
|
835
836
|
|
|
836
837
|
int sumi = sumi0 + sumi1;
|
|
837
|
-
sumf += (
|
|
838
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
838
839
|
}
|
|
839
840
|
|
|
840
841
|
*s = sumf;
|
|
@@ -865,7 +866,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
865
866
|
// Main loop
|
|
866
867
|
for (; ib < nb; ++ib) {
|
|
867
868
|
/* Compute combined scale for the block */
|
|
868
|
-
const __m256 d = __lasx_xvreplfr2vr_s(
|
|
869
|
+
const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); //FIXME
|
|
869
870
|
|
|
870
871
|
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
|
|
871
872
|
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
|
@@ -902,7 +903,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
902
903
|
}
|
|
903
904
|
|
|
904
905
|
int sumi = sumi0 + sumi1;
|
|
905
|
-
sumf += (
|
|
906
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
|
906
907
|
}
|
|
907
908
|
|
|
908
909
|
*s = sumf;
|
|
@@ -934,16 +935,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
934
935
|
|
|
935
936
|
// Main loop
|
|
936
937
|
for (; ib < nb; ++ib) {
|
|
937
|
-
const __m256 dx = __lasx_xvreplfr2vr_s(
|
|
938
|
+
const __m256 dx = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d));
|
|
938
939
|
|
|
939
|
-
summs +=
|
|
940
|
+
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
940
941
|
|
|
941
942
|
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
|
|
942
943
|
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
|
943
944
|
bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10));
|
|
944
945
|
qx = __lasx_xvor_v(qx, bxhi);
|
|
945
946
|
|
|
946
|
-
const __m256 dy = __lasx_xvreplfr2vr_s(
|
|
947
|
+
const __m256 dy = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
947
948
|
const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
|
|
948
949
|
|
|
949
950
|
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
|
@@ -973,7 +974,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
973
974
|
}
|
|
974
975
|
|
|
975
976
|
int sumi = sumi0 + sumi1;
|
|
976
|
-
sumf += (
|
|
977
|
+
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
977
978
|
}
|
|
978
979
|
|
|
979
980
|
*s = sumf;
|
|
@@ -1003,7 +1004,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1003
1004
|
// Main loop
|
|
1004
1005
|
for (; ib < nb; ++ib) {
|
|
1005
1006
|
// Compute combined scale for the block
|
|
1006
|
-
const __m256 d = __lasx_xvreplfr2vr_s(
|
|
1007
|
+
const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
1007
1008
|
__m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0);
|
|
1008
1009
|
__m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
|
|
1009
1010
|
|
|
@@ -1023,7 +1024,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1023
1024
|
sumi += x[ib].qs[j]*y[ib].qs[j];
|
|
1024
1025
|
}
|
|
1025
1026
|
|
|
1026
|
-
sumf += sumi*(
|
|
1027
|
+
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
1027
1028
|
}
|
|
1028
1029
|
|
|
1029
1030
|
*s = sumf;
|
|
@@ -1047,8 +1048,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1047
1048
|
|
|
1048
1049
|
for (int i = 0; i < nb; ++i) {
|
|
1049
1050
|
|
|
1050
|
-
const float d = y[i].d *
|
|
1051
|
-
const float dmin = -y[i].d *
|
|
1051
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1052
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1052
1053
|
|
|
1053
1054
|
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1054
1055
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -1116,8 +1117,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1116
1117
|
summs += y[i].bsums[j] * (sc[j] >> 4);
|
|
1117
1118
|
}
|
|
1118
1119
|
|
|
1119
|
-
const float dall = y[i].d *
|
|
1120
|
-
const float dmin = y[i].d *
|
|
1120
|
+
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1121
|
+
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1121
1122
|
|
|
1122
1123
|
int isum = 0;
|
|
1123
1124
|
int is = 0;
|
|
@@ -1170,7 +1171,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1170
1171
|
|
|
1171
1172
|
for (int i = 0; i < nb; ++i) {
|
|
1172
1173
|
|
|
1173
|
-
const float d = y[i].d *
|
|
1174
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1174
1175
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
1175
1176
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1176
1177
|
// Set up scales
|
|
@@ -1294,7 +1295,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1294
1295
|
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
1295
1296
|
q8 += 8; a += 8;
|
|
1296
1297
|
}
|
|
1297
|
-
const float d =
|
|
1298
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1298
1299
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1299
1300
|
}
|
|
1300
1301
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -1330,8 +1331,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1330
1331
|
|
|
1331
1332
|
for (int i = 0; i < nb; ++i) {
|
|
1332
1333
|
|
|
1333
|
-
const float d = y[i].d *
|
|
1334
|
-
const float dmin = -y[i].d *
|
|
1334
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1335
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1335
1336
|
|
|
1336
1337
|
memcpy(utmp, x[i].scales, 12);
|
|
1337
1338
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
@@ -1438,9 +1439,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1438
1439
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1439
1440
|
q8 += 8; a += 8;
|
|
1440
1441
|
}
|
|
1441
|
-
const float d =
|
|
1442
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1442
1443
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1443
|
-
const float dmin =
|
|
1444
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
1444
1445
|
sumf -= dmin * sumi;
|
|
1445
1446
|
}
|
|
1446
1447
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -1477,8 +1478,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1477
1478
|
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
1478
1479
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1479
1480
|
|
|
1480
|
-
const float d = y[i].d *
|
|
1481
|
-
const float dmin = -y[i].d *
|
|
1481
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1482
|
+
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1482
1483
|
|
|
1483
1484
|
memcpy(utmp, x[i].scales, 12);
|
|
1484
1485
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
@@ -1593,9 +1594,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1593
1594
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1594
1595
|
q8 += 8; a += 8;
|
|
1595
1596
|
}
|
|
1596
|
-
const float d =
|
|
1597
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1597
1598
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1598
|
-
const float dmin =
|
|
1599
|
+
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
1599
1600
|
sumf -= dmin * sumi;
|
|
1600
1601
|
}
|
|
1601
1602
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -1624,7 +1625,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1624
1625
|
|
|
1625
1626
|
for (int i = 0; i < nb; ++i) {
|
|
1626
1627
|
|
|
1627
|
-
const float d = y[i].d *
|
|
1628
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1628
1629
|
|
|
1629
1630
|
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
1630
1631
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
@@ -1713,7 +1714,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1713
1714
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1714
1715
|
q8 += 8; a += 8;
|
|
1715
1716
|
}
|
|
1716
|
-
const float d =
|
|
1717
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1717
1718
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1718
1719
|
}
|
|
1719
1720
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
@@ -1780,7 +1781,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
1780
1781
|
|
|
1781
1782
|
__m256 accumf = (__m256)__lasx_xvldi(0);
|
|
1782
1783
|
for (int i = 0; i < nb; ++i) {
|
|
1783
|
-
const float d =
|
|
1784
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1784
1785
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1785
1786
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1786
1787
|
__m256i sumi1 = __lasx_xvldi(0);
|
|
@@ -1820,7 +1821,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
1820
1821
|
|
|
1821
1822
|
float sumf = 0.f;
|
|
1822
1823
|
for (int i = 0; i < nb; ++i) {
|
|
1823
|
-
const float d =
|
|
1824
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1824
1825
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1825
1826
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1826
1827
|
int32_t bsum = 0;
|
|
@@ -1895,7 +1896,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1895
1896
|
|
|
1896
1897
|
__m256 accumf = (__m256)__lasx_xvldi(0);
|
|
1897
1898
|
for (int i = 0; i < nb; ++i) {
|
|
1898
|
-
const float d =
|
|
1899
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1899
1900
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1900
1901
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1901
1902
|
|
|
@@ -1980,7 +1981,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1980
1981
|
|
|
1981
1982
|
float sumf = 0.f;
|
|
1982
1983
|
for (int i = 0; i < nb; ++i) {
|
|
1983
|
-
const float d =
|
|
1984
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1984
1985
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1985
1986
|
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
|
1986
1987
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -2049,7 +2050,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2049
2050
|
|
|
2050
2051
|
__m256 accumf = (__m256)__lasx_xvldi(0);
|
|
2051
2052
|
for (int i = 0; i < nb; ++i) {
|
|
2052
|
-
const float d =
|
|
2053
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2053
2054
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
2054
2055
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
2055
2056
|
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
|
@@ -2108,7 +2109,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2108
2109
|
float sumf = 0;
|
|
2109
2110
|
for (int i = 0; i < nb; i++) {
|
|
2110
2111
|
|
|
2111
|
-
const float d =
|
|
2112
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2112
2113
|
const int8_t * q8 = y[i].qs;
|
|
2113
2114
|
const uint8_t * qs = x[i].qs;
|
|
2114
2115
|
const uint8_t * qh = x[i].qh;
|
|
@@ -2168,7 +2169,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
2168
2169
|
|
|
2169
2170
|
__m256 accumf = (__m256)__lasx_xvldi(0);
|
|
2170
2171
|
for (int i = 0; i < nb; ++i) {
|
|
2171
|
-
const float d =
|
|
2172
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2172
2173
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
2173
2174
|
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
2174
2175
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -2213,7 +2214,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
2213
2214
|
|
|
2214
2215
|
float sumf = 0.f;
|
|
2215
2216
|
for (int i = 0; i < nb; ++i) {
|
|
2216
|
-
const float d =
|
|
2217
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2217
2218
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
2218
2219
|
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
2219
2220
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
@@ -2279,7 +2280,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2279
2280
|
|
|
2280
2281
|
__m256 accumf = (__m256)__lasx_xvldi(0);
|
|
2281
2282
|
for (int i = 0; i < nb; ++i) {
|
|
2282
|
-
const float d =
|
|
2283
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2283
2284
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
2284
2285
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
2285
2286
|
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
|
|
@@ -2340,7 +2341,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2340
2341
|
|
|
2341
2342
|
float sumf = 0.f;
|
|
2342
2343
|
for (int i = 0; i < nb; ++i) {
|
|
2343
|
-
const float d =
|
|
2344
|
+
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2344
2345
|
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
2345
2346
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
2346
2347
|
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
|
@@ -2451,7 +2452,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2451
2452
|
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
|
|
2452
2453
|
}
|
|
2453
2454
|
|
|
2454
|
-
const float d = y[i].d *
|
|
2455
|
+
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
2455
2456
|
accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
|
|
2456
2457
|
accum1 += d * sumi1;
|
|
2457
2458
|
}
|
|
@@ -2484,7 +2485,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2484
2485
|
qs += 4;
|
|
2485
2486
|
}
|
|
2486
2487
|
|
|
2487
|
-
sumf +=
|
|
2488
|
+
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
|
2488
2489
|
}
|
|
2489
2490
|
|
|
2490
2491
|
*s = sumf;
|
|
@@ -2530,9 +2531,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
2530
2531
|
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
|
2531
2532
|
const __m256i p_1 = lasx_madd_h(p16_1, mone);
|
|
2532
2533
|
const __m256i p_2 = lasx_madd_h(p16_2, mone);
|
|
2533
|
-
accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(
|
|
2534
|
+
accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
|
|
2534
2535
|
__lasx_xvffint_s_w(p_1), accum1);
|
|
2535
|
-
accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(
|
|
2536
|
+
accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
|
|
2536
2537
|
__lasx_xvffint_s_w(p_2), accum2);
|
|
2537
2538
|
}
|
|
2538
2539
|
|
|
@@ -2540,7 +2541,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
2540
2541
|
|
|
2541
2542
|
#endif
|
|
2542
2543
|
for (; ib < nb; ++ib) {
|
|
2543
|
-
const float d =
|
|
2544
|
+
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
|
|
2544
2545
|
int sumi1 = 0, sumi2 = 0;
|
|
2545
2546
|
for (int j = 0; j < QK4_NL/2; ++j) {
|
|
2546
2547
|
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
|
@@ -2595,7 +2596,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
2595
2596
|
sumi1 = __lasx_xvadd_w(p_1, sumi1);
|
|
2596
2597
|
sumi2 = __lasx_xvadd_w(p_2, sumi2);
|
|
2597
2598
|
}
|
|
2598
|
-
accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(
|
|
2599
|
+
accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
|
|
2599
2600
|
__lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum);
|
|
2600
2601
|
}
|
|
2601
2602
|
|
|
@@ -2604,7 +2605,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
2604
2605
|
#else
|
|
2605
2606
|
float sumf = 0;
|
|
2606
2607
|
for (int ibl = 0; ibl < nb; ++ibl) {
|
|
2607
|
-
const float d4d8 =
|
|
2608
|
+
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
|
2608
2609
|
uint16_t h = x[ibl].scales_h;
|
|
2609
2610
|
const uint8_t * qs = x[ibl].qs;
|
|
2610
2611
|
const int8_t * q8 = y[ibl].qs;
|