@fugood/llama.node 1.0.0-beta.7 → 1.0.1
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +8 -0
- package/lib/index.ts +14 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +58 -8
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
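The ggml-cpu hunks reproduced below (ops.cpp, quants.c, repack.cpp) are largely mechanical: scalar FP16 conversions now go through the GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16 macros, and the files that use them gain an `#include "simd-mappings.h"`. For orientation, here is a minimal sketch of the pattern the one-line hunks restore; the helper name `add_scalar_f16` is hypothetical, and only the two macros and `ggml_fp16_t` are taken from the diff itself:

```c
#include "ggml.h"            // ggml_fp16_t
#include "simd-mappings.h"   // GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16

// Hypothetical helper mirroring the add1_f16 hunks: widen FP16 to FP32,
// do the arithmetic in FP32, then narrow the result back to FP16.
static void add_scalar_f16(ggml_fp16_t * dst, const ggml_fp16_t * src, int n, float v) {
    for (int i = 0; i < n; i++) {
        dst[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src[i]) + v);
    }
}
```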
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp (+48 -48)

@@ -108,7 +108,7 @@ static void ggml_compute_forward_dup_f16(
 for (int i01 = ir0; i01 < ir1; i01++) {
 const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
 for (int i00 = 0; i00 < ne00; i00++) {
-dst_ptr[id] =
+dst_ptr[id] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
 id++;
 }
 }
@@ -130,7 +130,7 @@ static void ggml_compute_forward_dup_f16(
 const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);

 for (int i00 = 0; i00 < ne00; i00++) {
-src0_f32[i00] =
+src0_f32[i00] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
 }

 quantize_row_q(src0_f32, dst_ptr + id, ne00);
@@ -156,7 +156,7 @@ static void ggml_compute_forward_dup_f16(
 for (int i00 = 0; i00 < ne00; i00++) {
 const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);

-dst_ptr[id] =
+dst_ptr[id] = GGML_CPU_FP16_TO_FP32(*src0_ptr);
 id++;
 }
 }
@@ -267,7 +267,7 @@ static void ggml_compute_forward_dup_f16(
 const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

-*(float *) dst_ptr =
+*(float *) dst_ptr = GGML_CPU_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);

 if (++i10 == ne0) {
 i10 = 0;
@@ -372,7 +372,7 @@ static void ggml_compute_forward_dup_bf16(
 for (int i01 = ir0; i01 < ir1; i01++) {
 const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
 for (int i00 = 0; i00 < ne00; i00++) {
-dst_ptr[id] =
+dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
 id++;
 }
 }
@@ -473,7 +473,7 @@ static void ggml_compute_forward_dup_bf16(
 for (int i00 = 0; i00 < ne00; i00++) {
 const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);

-dst_ptr[id] =
+dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
 id++;
 }
 }
@@ -566,7 +566,7 @@ static void ggml_compute_forward_dup_bf16(
 const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

-*(ggml_fp16_t *) dst_ptr =
+*(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));

 if (++i10 == ne0) {
 i10 = 0;
@@ -765,7 +765,7 @@ static void ggml_compute_forward_dup_f32(
 for (int i00 = 0; i00 < ne00; i00++) {
 const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);

-dst_ptr[id] =
+dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr);
 id++;
 }
 }
@@ -878,7 +878,7 @@ static void ggml_compute_forward_dup_f32(
 const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

-*(ggml_fp16_t *) dst_ptr =
+*(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(*(const float *) src0_ptr);

 if (++i10 == ne0) {
 i10 = 0;
@@ -1419,7 +1419,7 @@ static void ggml_compute_forward_add1_f16_f32(
 ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
 ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
 for (int i = 0; i < ne0; i++) {
-dst_ptr[i] =
+dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v);
 }
 }
 }
@@ -1435,7 +1435,7 @@ static void ggml_compute_forward_add1_f16_f16(
 GGML_ASSERT(ggml_is_scalar(src1));

 // scalar to add
-const float v =
+const float v = GGML_CPU_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);

 const int ith = params->ith;
 const int nth = params->nth;
@@ -1467,7 +1467,7 @@ static void ggml_compute_forward_add1_f16_f16(
 ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
 ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
 for (int i = 0; i < ne0; i++) {
-dst_ptr[i] =
+dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v);
 }
 }
 }
@@ -1889,7 +1889,7 @@ static void ggml_compute_forward_sum_f16(
 }
 }
 }
-((ggml_fp16_t *) dst->data)[0] =
+((ggml_fp16_t *) dst->data)[0] = GGML_CPU_FP32_TO_FP16(sum);
 }

 static void ggml_compute_forward_sum_bf16(
@@ -2660,7 +2660,7 @@ static void ggml_compute_forward_gelu_f16(
 #ifndef NDEBUG
 for (int k = 0; k < nc; k++) {
 const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-const float v =
+const float v = GGML_CPU_FP16_TO_FP32(x);
 GGML_UNUSED(v);
 assert(!isnan(v));
 assert(!isinf(v));
@@ -2763,7 +2763,7 @@ static void ggml_compute_forward_gelu_erf_f16(
 #ifndef NDEBUG
 for (int k = 0; k < nc; k++) {
 const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-const float v =
+const float v = GGML_CPU_FP16_TO_FP32(x);
 GGML_UNUSED(v);
 assert(!isnan(v));
 assert(!isinf(v));
@@ -2866,7 +2866,7 @@ static void ggml_compute_forward_gelu_quick_f16(
 #ifndef NDEBUG
 for (int k = 0; k < nc; k++) {
 const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-const float v =
+const float v = GGML_CPU_FP16_TO_FP32(x);
 GGML_UNUSED(v);
 assert(!isnan(v));
 assert(!isinf(v));
@@ -2969,7 +2969,7 @@ static void ggml_compute_forward_silu_f16(
 #ifndef NDEBUG
 for (int k = 0; k < nc; k++) {
 const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k];
-const float v =
+const float v = GGML_CPU_FP16_TO_FP32(x);
 GGML_UNUSED(v);
 assert(!isnan(v));
 assert(!isinf(v));
@@ -3163,7 +3163,7 @@ static void ggml_compute_forward_silu_back_f16(
 #ifndef NDEBUG
 for (int k = 0; k < nc; k++) {
 const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-const float v =
+const float v = GGML_CPU_FP16_TO_FP32(x);
 GGML_UNUSED(v);
 assert(!isnan(v));
 assert(!isinf(v));
@@ -4500,7 +4500,7 @@ static void ggml_compute_forward_get_rows_back_f32_f16(

 for (int j = 0; j < nc; ++j) {
 ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
-((float *) ((char *) dst->data + r*dst->nb[1]))[j] +=
+((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_CPU_FP16_TO_FP32(v);
 }
 }
 }
@@ -4792,7 +4792,7 @@ static void ggml_compute_forward_soft_max_f32(
 if (mp_f32) {
 if (use_f16) {
 for (int i = 0; i < nc; ++i) {
-wp[i] += slope*
+wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]);
 }
 } else {
 for (int i = 0; i < nc; ++i) {
@@ -5018,8 +5018,8 @@ static void ggml_compute_forward_clamp_f16(
 ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);

 for (int i = 0; i < nc; i++) {
-float v =
-dst_ptr[i] =
+float v = GGML_CPU_FP16_TO_FP32(src0_ptr[i]);
+dst_ptr[i] = GGML_CPU_FP32_TO_FP16(MAX(MIN(v, max), min));
 }
 }
 }
@@ -5476,11 +5476,11 @@ static void ggml_compute_forward_rope_f16(
 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
 ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

-const float x0 =
-const float x1 =
+const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
+const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);

-dst_data[0] =
-dst_data[n_dims] =
+dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
 }
 } else {
 for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
@@ -5492,11 +5492,11 @@ static void ggml_compute_forward_rope_f16(
 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
 ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

-const float x0 =
-const float x1 =
+const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
+const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]);

-dst_data[0] =
-dst_data[n_dims/2] =
+dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
 }
 }
 } else {
@@ -5507,11 +5507,11 @@ static void ggml_compute_forward_rope_f16(
 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
 ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

-const float x0 =
-const float x1 =
+const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
+const float x1 = GGML_CPU_FP16_TO_FP32(src[1]);

-dst_data[0] =
-dst_data[1] =
+dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
 }
 }

@@ -5525,11 +5525,11 @@ static void ggml_compute_forward_rope_f16(
 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
 ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

-const float x0 =
-const float x1 =
+const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
+const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);

-dst_data[0] =
-dst_data[n_dims] =
+dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
 }
 } else {
 for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
@@ -5640,7 +5640,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
 for (int64_t i11 = 0; i11 < ne11; i11++) {
 const float * const src = (float *)((char *) src1->data + i11*nb11);
 for (int64_t i10 = 0; i10 < ne10; i10++) {
-dst_data[i10*ne11 + i11] =
+dst_data[i10*ne11 + i11] = GGML_CPU_FP32_TO_FP16(src[i10]);
 }
 }
 }
@@ -5933,7 +5933,7 @@ static void ggml_compute_forward_im2col_f16(
 if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
 dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
 } else {
-dst_data[iic*(KH*KW) + ikh*KW + ikw] =
+dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]);
 }
 }
 }
@@ -6109,7 +6109,7 @@ void ggml_compute_forward_conv_transpose_2d(
 const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
 ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
 for (int i10 = 0; i10 < ne10; i10++) {
-dst_data[i10*ne12 + i12] =
+dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
 }
 }
 }
@@ -6358,7 +6358,7 @@ static void ggml_compute_forward_pool_1d_sk_p0(
 case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
 }
 for (int ki = 0; ki < k; ++ki) {
-const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] :
+const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
 switch (op) {
 case GGML_OP_POOL_AVG: drow[i] += srow_j; break;
 case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break;
@@ -6450,7 +6450,7 @@ void ggml_compute_forward_pool_2d(
 for (int kx = 0; kx < k0; ++kx) {
 int j = ix + kx;
 if (j < 0 || j >= src->ne[0]) continue;
-const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] :
+const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
 switch (op) {
 case GGML_OP_POOL_AVG: *out += srow_j; break;
 case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break;
@@ -6538,7 +6538,7 @@ void ggml_compute_forward_pool_2d_back(
 }

 const float val = dst->type == GGML_TYPE_F32 ?
-((const float *) drowf)[j] :
+((const float *) drowf)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
 if (val <= maxval) {
 continue;
 }
@@ -6558,7 +6558,7 @@ void ggml_compute_forward_pool_2d_back(
 if (dst->type == GGML_TYPE_F32) {
 ((float *) drow)[j] += grad0;
 } else {
-((ggml_fp16_t *) drow)[j] =
+((ggml_fp16_t *) drow)[j] = GGML_CPU_FP32_TO_FP16(grad0 + GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
 }
 } else if (op == GGML_OP_POOL_AVG) {
 const float grad = grad0 / ka;
@@ -6577,7 +6577,7 @@ void ggml_compute_forward_pool_2d_back(
 if (dst->type == GGML_TYPE_F32) {
 ((float *) drow)[j] += grad;
 } else {
-((ggml_fp16_t *) drow)[j] +=
+((ggml_fp16_t *) drow)[j] += GGML_CPU_FP32_TO_FP16(grad);
 }
 }
 }
@@ -7142,7 +7142,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
 // loop over n_kv and n_head_kv
 // ref: https://arxiv.org/pdf/2112.05682.pdf
 for (int64_t ic = 0; ic < nek1; ++ic) {
-const float mv = mp ? slope*
+const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
 if (mv == -INFINITY) {
 continue;
 }
@@ -7210,7 +7210,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(

 if (v->type == GGML_TYPE_F16) {
 for (int64_t d = 0; d < DV; ++d) {
-VKQ32[d] =
+VKQ32[d] = GGML_CPU_FP16_TO_FP32(VKQ16[d]);
 }
 }

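The quants.c hunks that follow all touch the generic (non-SIMD) vector dot-product kernels in the same spot: each block's integer accumulator is scaled by the per-block FP16 scale(s), widened with GGML_CPU_FP16_TO_FP32. A minimal sketch of that shape for the Q8_0 x Q8_0 case visible in the diff (block_q8_0 and QK8_0 are assumed to come from ggml-common.h; the function name is illustrative, not the package's):

```c
// Sketch of the generic Q8_0 x Q8_0 dot product shown in the hunks below:
// an int8 dot product per block, scaled by the product of the two FP16 scales.
static float vec_dot_q8_0_sketch(int n, const block_q8_0 * x, const block_q8_0 * y) {
    const int nb = n / QK8_0;
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; j++) {
            sumi += x[ib].qs[j] * y[ib].qs[j];
        }
        sumf += sumi * (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
    }
    return sumf;
}
```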
package/src/llama.cpp/ggml/src/ggml-cpu/quants.c (+25 -24)

@@ -2,6 +2,7 @@
 #include "ggml-common.h"

 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "ggml-quants.h"
 #include "quants.h"

@@ -137,7 +138,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 }

 int sumi = sumi0 + sumi1;
-sumf += sumi*
+sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
 }

 *s = sumf;
@@ -174,7 +175,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 }

 int sumi = sumi0 + sumi1;
-sumf += (
+sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
 }

 *s = sumf;
@@ -217,7 +218,7 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 }

 int sumi = sumi0 + sumi1;
-sumf += (
+sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
 }

 *s = sumf;
@@ -260,7 +261,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 }

 int sumi = sumi0 + sumi1;
-sumf += (
+sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
 }

 *s = sumf;
@@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 sumi += x[ib].qs[j]*y[ib].qs[j];
 }

-sumf += sumi*(
+sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
 }

 *s = sumf;
@@ -342,7 +343,7 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 }
 }

-sumf += (float) sum * (
+sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
 }

 *s = sumf;
@@ -372,7 +373,7 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 }
 }

-const float d = y[i].d *
+const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);

 sumf += (float) sumi * d;
 }
@@ -405,8 +406,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 summs += y[i].bsums[j] * (sc[j] >> 4);
 }

-const float dall = y[i].d *
-const float dmin = y[i].d *
+const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

 int isum = 0;
 int is = 0;
@@ -504,7 +505,7 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
 q8 += 8; a += 8;
 }
-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -577,9 +578,9 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin =
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -657,9 +658,9 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin =
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -714,7 +715,7 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -739,7 +740,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs

 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint16_t * GGML_RESTRICT q2 = x[i].qs;
 const int8_t * GGML_RESTRICT q8 = y[i].qs;
 int32_t bsum = 0;
@@ -778,7 +779,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,

 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint16_t * GGML_RESTRICT q2 = x[i].qs;
 const uint8_t * GGML_RESTRICT sc = x[i].scales;
 const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -829,7 +830,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 float sumf = 0;
 for (int i = 0; i < nb; i++) {

-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const int8_t * q8 = y[i].qs;
 const uint8_t * qs = x[i].qs;
 const uint8_t * qh = x[i].qh;
@@ -882,7 +883,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs

 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint8_t * GGML_RESTRICT q3 = x[i].qs;
 const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
 const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -924,7 +925,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,

 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint8_t * GGML_RESTRICT qs = x[i].qs;
 const uint8_t * GGML_RESTRICT qh = x[i].qh;
 const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -1002,7 +1003,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 qs += 4;
 }

-sumf +=
+sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
 }

 *s = sumf;
@@ -1063,7 +1064,7 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 qh += 2;
 }

-sumf +=
+sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
 }

 *s = sumf;
@@ -1087,7 +1088,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 float sumf = 0;

 for (; ib < nb; ++ib) {
-const float d =
+const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
 int sumi1 = 0, sumi2 = 0;
 for (int j = 0; j < QK4_NL/2; ++j) {
 sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -1113,7 +1114,7 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,

 float sumf = 0;
 for (int ibl = 0; ibl < nb; ++ibl) {
-const float d4d8 =
+const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
 uint16_t h = x[ibl].scales_h;
 const uint8_t * qs = x[ibl].qs;
 const int8_t * q8 = y[ibl].qs;
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp (+15 -14)

@@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "traits.h"

 #include "arch-fallback.h"
@@ -72,7 +73,7 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
 const float d = amax / ((1 << 7) - 1);
 id[row_iter] = d ? 1.0f / d : 0.0f;

-y[i].d[row_iter] =
+y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
 }

 for (int j = 0; j < QK8_0 * 4; j++) {
@@ -110,7 +111,7 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
 const float d = amax / ((1 << 7) - 1);
 id[row_iter] = d ? 1.0f / d : 0.0f;

-y[i].d[row_iter] =
+y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
 }

 for (int j = 0; j < QK8_0 * 4; j++) {
@@ -236,7 +237,7 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
 sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
 }
-sumf[j] += sumi *
+sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 }
 }
 }
@@ -280,7 +281,7 @@ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
 sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
 }
-sumf[j] += sumi *
+sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 }
 }
 }
@@ -325,7 +326,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
 sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
 }
-sumf[j] += sumi *
+sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 }
 }
 }
@@ -396,13 +397,13 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 sumi2 = sumi2 * scales_1[j];
 sumi += sumi1 + sumi2;
 }
-sumf[j] += sumi *
+sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
 }
 }
 for (int sb = 0; sb < 8; sb++) {
 uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
 for (int j = 0; j < ncols_interleaved; j++) {
-sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
+sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
 }
 }
 }
@@ -449,7 +450,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
 const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
 sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
 }
-sumf[j] += sumi *
+sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 }
 }
 }
@@ -500,7 +501,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
 (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
 }
 }
 }
@@ -555,7 +556,7 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
 (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
 }
 }
 }
@@ -609,7 +610,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
 (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
 }
 }
 }
@@ -688,7 +689,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 sumi2 = sumi2 * scales_1[j];
 sumi += sumi1 + sumi2;
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
 }
 }
 }
@@ -697,7 +698,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 for(int m = 0; m < 4; m++) {
 const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
 for(int j = 0; j < ncols_interleaved; j++) {
-sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
+sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
 }
 }
 }
@@ -753,7 +754,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
 (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
 }
 }
 }