@fugood/llama.node 0.3.13 → 0.3.14
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +60 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +112 -11
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +110 -67
- package/src/llama.cpp/examples/server/server.cpp +82 -87
- package/src/llama.cpp/examples/server/utils.hpp +94 -107
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +5 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +8 -3
- package/src/llama.cpp/include/llama.h +19 -5
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +69 -5
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +147 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -719,28 +719,28 @@ static inline __m128i packNibbles( __m256i bytes ) {
 }
 #endif //__loongarch_asx
 
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     quantize_row_q4_0_ref(x, y, k);
 }
 
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     quantize_row_q4_1_ref(x, y, k);
 }
 
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q5_0_ref(x, y, k);
 }
 
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     quantize_row_q5_1_ref(x, y, k);
 }
 
-void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
 
-    block_q8_0 * restrict y = vy;
+    block_q8_0 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
@@ -1011,6 +1011,38 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
 
     }
+#elif defined(__VXE__) || defined(__VXE2__)
+    for (int i = 0; i < nb; i++) {
+        __vector float srcv [8];
+        __vector float asrcv[8];
+        __vector float amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f / d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const __vector float v = vec_mul(srcv[j], vec_splats(id));
+            const __vector int32_t vi = vec_signed(v);
+
+            y[i].qs[4*j + 0] = vec_extract(vi, 0);
+            y[i].qs[4*j + 1] = vec_extract(vi, 1);
+            y[i].qs[4*j + 2] = vec_extract(vi, 2);
+            y[i].qs[4*j + 3] = vec_extract(vi, 3);
+        }
+    }
 #else
     GGML_UNUSED(nb);
     // scalar
@@ -1018,11 +1050,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 #endif
 }
 
-void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK8_1 == 0);
     const int nb = k / QK8_1;
 
-    block_q8_1 * restrict y = vy;
+    block_q8_1 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
@@ -1337,6 +1369,44 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0);
         __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
     }
+#elif defined(__VXE__) || defined(__VXE2__)
+    for (int i = 0; i < nb; i++) {
+        __vector float srcv [8];
+        __vector float asrcv[8];
+        __vector float amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f / d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        __vector int32_t acc = vec_splats(0);
+
+        for (int j = 0; j < 8; j++) {
+            const __vector float v = vec_mul(srcv[j], vec_splats(id));
+            const __vector int32_t vi = vec_signed(v);
+
+            y[i].qs[4*j + 0] = vec_extract(vi, 0);
+            y[i].qs[4*j + 1] = vec_extract(vi, 1);
+            y[i].qs[4*j + 2] = vec_extract(vi, 2);
+            y[i].qs[4*j + 3] = vec_extract(vi, 3);
+
+            acc = vec_add(acc, vi);
+        }
+
+        y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
+    }
 #else
     GGML_UNUSED(nb);
     // scalar
@@ -1358,8 +1428,8 @@ static inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
-static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
-        const float * restrict qw) {
+static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
+        const float * GGML_RESTRICT qw) {
     float max = 0;
     float amax = 0;
     for (int i = 0; i < n; ++i) {
@@ -1427,7 +1497,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
     return scale;
 }
 
-static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
+static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
     float max = 0;
     float amax = 0;
     for (int i = 0; i < n; ++i) {
@@ -1486,7 +1556,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
         int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
@@ -1529,8 +1599,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
     return scale;
 }
 
-static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
-        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
+        uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
         float rmin, float rdelta, int nstep, bool use_mad) {
     float min = x[0];
     float max = x[0];
@@ -1610,7 +1680,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
     return scale;
 }
 
-static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
+static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
     if (j < 4) {
         *d = q[j] & 63; *m = q[j + 4] & 63;
     } else {
@@ -1621,51 +1691,51 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
 
 //========================- 2-bit (de)-quantization
 
-void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     quantize_row_q2_K_ref(x, vy, k);
 }
 
 //========================= 3-bit (de)-quantization
 
-void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     quantize_row_q3_K_ref(x, vy, k);
 }
 
 // ====================== 4-bit (de)-quantization
 
-void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_q4_K * restrict y = vy;
+    block_q4_K * GGML_RESTRICT y = vy;
     quantize_row_q4_K_ref(x, y, k);
 }
 
 // ====================== 5-bit (de)-quantization
 
-void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_q5_K * restrict y = vy;
+    block_q5_K * GGML_RESTRICT y = vy;
     quantize_row_q5_K_ref(x, y, k);
 }
 
 // ====================== 6-bit (de)-quantization
 
-void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_q6_K * restrict y = vy;
+    block_q6_K * GGML_RESTRICT y = vy;
     quantize_row_q6_K_ref(x, y, k);
 }
 
 // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
 
-void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_tq1_0 * restrict y = vy;
+    block_tq1_0 * GGML_RESTRICT y = vy;
     quantize_row_tq1_0_ref(x, y, k);
 }
 
-void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_tq2_0 * restrict y = vy;
+    block_tq2_0 * GGML_RESTRICT y = vy;
     quantize_row_tq2_0_ref(x, y, k);
 }
 
@@ -1673,11 +1743,11 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1
 
 //===================================== Q8_K ==============================================
 
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
 #ifdef __wasm_simd128__
     assert(k % QK_K == 0);
     const int64_t nb = k / QK_K;
-    block_q8_K * restrict yc = y; // Cast to proper type
+    block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type
 
     for (int i = 0; i < nb; i++) {
         const float * x_block = x + i * QK_K;
@@ -1839,7 +1909,7 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif
 
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -1854,23 +1924,23 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q4_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
+    const block_q4_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
-        const block_q4_0 * restrict vx0 = vx;
-        const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
-        const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+        const block_q4_0 * GGML_RESTRICT vx0 = vx;
+        const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
+        const block_q8_0 * GGML_RESTRICT vy0 = vy;
+        const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
 
         float32x4_t sumv0 = vdupq_n_f32(0.0f);
 
        for (int i = 0; i < nb; i++) {
-            const block_q4_0 * restrict b_x0 = &vx0[i];
-            const block_q4_0 * restrict b_x1 = &vx1[i];
-            const block_q8_0 * restrict b_y0 = &vy0[i];
-            const block_q8_0 * restrict b_y1 = &vy1[i];
+            const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i];
+            const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i];
+            const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
+            const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
 
             const uint8x16_t m4b = vdupq_n_u8(0x0F);
             const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -1947,10 +2017,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
 
         for (; ib + 1 < nb; ib += 2) {
-            const block_q4_0 * restrict x0 = &x[ib + 0];
-            const block_q4_0 * restrict x1 = &x[ib + 1];
-            const block_q8_0 * restrict y0 = &y[ib + 0];
-            const block_q8_0 * restrict y1 = &y[ib + 1];
+            const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             // load x
             const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -1993,10 +2063,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
 
         for (; ib + 1 < nb; ib += 2) {
-            const block_q4_0 * restrict x0 = &x[ib + 0];
-            const block_q4_0 * restrict x1 = &x[ib + 1];
-            const block_q8_0 * restrict y0 = &y[ib + 0];
-            const block_q8_0 * restrict y1 = &y[ib + 1];
+            const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             // load x
             const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -2034,10 +2104,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const svbool_t pl16 = svnot_b_z(ph32, ph16);
 
         for (; ib + 1 < nb; ib += 2) {
-            const block_q4_0 * restrict x0 = &x[ib + 0];
-            const block_q4_0 * restrict x1 = &x[ib + 1];
-            const block_q8_0 * restrict y0 = &y[ib + 0];
-            const block_q8_0 * restrict y1 = &y[ib + 1];
+            const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             // load x
            const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
@@ -2074,10 +2144,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
     for (; ib + 1 < nb; ib += 2) {
-        const block_q4_0 * restrict x0 = &x[ib + 0];
-        const block_q4_0 * restrict x1 = &x[ib + 1];
-        const block_q8_0 * restrict y0 = &y[ib + 0];
-        const block_q8_0 * restrict y1 = &y[ib + 1];
+        const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0x0F);
         const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -2119,10 +2189,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     const v128_t s8b = wasm_i8x16_splat(0x8);
 
     for (; ib + 1 < nb; ib += 2) {
-        const block_q4_0 * restrict x0 = &x[ib];
-        const block_q4_0 * restrict x1 = &x[ib + 1];
-        const block_q8_0 * restrict y0 = &y[ib];
-        const block_q8_0 * restrict y1 = &y[ib + 1];
+        const block_q4_0 * GGML_RESTRICT x0 = &x[ib];
+        const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
         // Load and process x0
         v128_t v0_0 = wasm_v128_load(x0->qs);
@@ -2488,6 +2558,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#elif defined(__VXE__) || defined(__VXE2__)
+    __vector float acc = vec_splats(0.0f);
+
+    const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
+    const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
+
+    for (; ib < nb; ++ib) {
+        const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
+        const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
+        const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+
+        const __vector int8_t v_xls = vec_sub(v_xl, v_s);
+        const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+
+        const __vector int8_t v_yl = vec_xl(0      , y[ib].qs);
+        const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+
+        const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
+        const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
+        const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
+        const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+
+        __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+
+        const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
+        const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        acc = vec_madd(v_xy, v_d, acc);
+    }
+
+    sumf = acc[0] + acc[1] + acc[2] + acc[3];
 #endif
     for (; ib < nb; ++ib) {
         int sumi0 = 0;
@@ -2508,7 +2609,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
@@ -2523,24 +2624,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q4_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
+    const block_q4_1 * GGML_RESTRICT x = vx;
+    const block_q8_1 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
-        const block_q4_1 * restrict vx0 = vx;
-        const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
-        const block_q8_1 * restrict vy0 = vy;
-        const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
+        const block_q4_1 * GGML_RESTRICT vx0 = vx;
+        const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
+        const block_q8_1 * GGML_RESTRICT vy0 = vy;
+        const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
 
         float32x4_t sumv0 = vdupq_n_f32(0.0f);
         float32x4_t summs0 = vdupq_n_f32(0.0f);
 
        for (int i = 0; i < nb; i++) {
-            const block_q4_1 * restrict b_x0 = &vx0[i];
-            const block_q4_1 * restrict b_x1 = &vx1[i];
-            const block_q8_1 * restrict b_y0 = &vy0[i];
-            const block_q8_1 * restrict b_y1 = &vy1[i];
+            const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i];
+            const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i];
+            const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i];
+            const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];
 
             float32_t summs_t[4] = {
                 GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
@@ -2614,10 +2715,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     float summs = 0;
 
    for (; ib + 1 < nb; ib += 2) {
-        const block_q4_1 * restrict x0 = &x[ib + 0];
-        const block_q4_1 * restrict x1 = &x[ib + 1];
-        const block_q8_1 * restrict y0 = &y[ib + 0];
-        const block_q8_1 * restrict y1 = &y[ib + 1];
+        const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
 
         summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
 
@@ -2781,6 +2882,35 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_8(acc) + summs;
+#elif defined(__VXE__) || defined(__VXE2__)
+    float summs = 0;
+    float32x4_t acc = vec_splats(0.0f);
+
+    const uint8x16_t v_m = vec_splat_u8(0x0F);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+        const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+        const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
+
+        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
+        const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs);
+
+        const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+        const float32x4_t v_xy = vec_float(v_xy_);
+
+        const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        acc = vec_madd(v_xy, v_d, acc);
+    }
+
+    sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
 #endif
     for (; ib < nb; ++ib) {
         int sumi0 = 0;
@@ -2801,7 +2931,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -2816,8 +2946,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q5_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
+    const block_q5_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -2830,10 +2960,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     uint64_t tmp1[4];
 
    for (; ib + 1 < nb; ib += 2) {
-        const block_q5_0 * restrict x0 = &x[ib];
-        const block_q5_0 * restrict x1 = &x[ib + 1];
-        const block_q8_0 * restrict y0 = &y[ib];
-        const block_q8_0 * restrict y1 = &y[ib + 1];
+        const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+        const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0x0F);
 
@@ -2894,8 +3024,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
     // TODO: check if unrolling this is better
     for (; ib < nb; ++ib) {
-        const block_q5_0 * restrict x0 = &x[ib];
-        const block_q8_0 * restrict y0 = &y[ib];
+        const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
 
         const v128_t m4b = wasm_i8x16_splat(0x0F);
 
@@ -3156,7 +3286,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
@@ -3171,8 +3301,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q5_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
+    const block_q5_1 * GGML_RESTRICT x = vx;
+    const block_q8_1 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -3188,10 +3318,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     uint64_t tmp1[4];
 
    for (; ib + 1 < nb; ib += 2) {
-        const block_q5_1 * restrict x0 = &x[ib];
-        const block_q5_1 * restrict x1 = &x[ib + 1];
-        const block_q8_1 * restrict y0 = &y[ib];
-        const block_q8_1 * restrict y1 = &y[ib + 1];
+        const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+        const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
+        const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0x0F);
 
@@ -3257,8 +3387,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
     // TODO: check if unrolling this is better
    for (; ib < nb; ++ib) {
-        const block_q5_1 * restrict x0 = &x[ib];
-        const block_q8_1 * restrict y0 = &y[ib];
+        const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
 
         summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
 
@@ -3530,7 +3660,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -3545,24 +3675,24 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q8_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
+    const block_q8_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
-        const block_q8_0 * restrict vx0 = vx;
-        const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
-        const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+        const block_q8_0 * GGML_RESTRICT vx0 = vx;
+        const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
+        const block_q8_0 * GGML_RESTRICT vy0 = vy;
+        const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
 
         float32x4_t sumv0 = vdupq_n_f32(0.0f);
 
        for (int i = 0; i < nb; i++) {
-            const block_q8_0 * restrict b_x0 = &vx0[i];
-            const block_q8_0 * restrict b_y0 = &vy0[i];
+            const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i];
+            const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
 
-            const block_q8_0 * restrict b_x1 = &vx1[i];
-            const block_q8_0 * restrict b_y1 = &vy1[i];
+            const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i];
+            const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
 
             const int8x16_t x0_l = vld1q_s8(b_x0->qs);
             const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
@@ -3627,10 +3757,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
 
         for (; ib + 1 < nb; ib += 2) {
-            const block_q8_0 * restrict x0 = &x[ib + 0];
-            const block_q8_0 * restrict x1 = &x[ib + 1];
-            const block_q8_0 * restrict y0 = &y[ib + 0];
-            const block_q8_0 * restrict y1 = &y[ib + 1];
+            const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             // load x
             const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
@@ -3658,10 +3788,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     {
         //printf("sve256");
        for (; ib + 1 < nb; ib += 2) {
-            const block_q8_0 * restrict x0 = &x[ib + 0];
-            const block_q8_0 * restrict x1 = &x[ib + 1];
-            const block_q8_0 * restrict y0 = &y[ib + 0];
-            const block_q8_0 * restrict y1 = &y[ib + 1];
+            const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             // load x
             const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
@@ -3694,10 +3824,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         svfloat32_t sumv00 = svdup_n_f32(0.0f);
 
        for (; ib + 1 < nb; ib += 2) {
-            const block_q8_0 * restrict x0 = &x[ib + 0];
-            const block_q8_0 * restrict x1 = &x[ib + 1];
-            const block_q8_0 * restrict y0 = &y[ib + 0];
-            const block_q8_0 * restrict y1 = &y[ib + 1];
+            const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
             // and add them to make one 64 element vector
@@ -3737,10 +3867,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
    for (; ib + 1 < nb; ib += 2) {
-        const block_q8_0 * restrict x0 = &x[ib + 0];
-        const block_q8_0 * restrict x1 = &x[ib + 1];
-        const block_q8_0 * restrict y0 = &y[ib + 0];
-        const block_q8_0 * restrict y1 = &y[ib + 1];
+        const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
         const int8x16_t x0_0 = vld1q_s8(x0->qs);
         const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
@@ -3767,8 +3897,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     v128_t sumv = wasm_f32x4_splat(0.0f);
 
    for (; ib < nb; ++ib) {
-        const block_q8_0 * restrict x0 = &x[ib];
-        const block_q8_0 * restrict y0 = &y[ib];
+        const block_q8_0 * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
 
         const v128_t x0_0 = wasm_v128_load(x0->qs);
         const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
@@ -3915,6 +4045,27 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_8(acc);
+#elif defined(__VXE__) || defined(__VXE2__)
+    __vector float acc = vec_splats(0.0f);
+
+#pragma GCC unroll 8
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        const int8x16_t v_xl = vec_xl(0      , x[ib].qs);
+        const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs);
+        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+
+        const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+        const float32x4_t v_xy = vec_float(v_xy_);
+        const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        acc = vec_madd(v_xy, v_d, acc);
+    }
+
+    sumf = acc[0] + acc[1] + acc[2] + acc[3];
 #endif
     for (; ib < nb; ++ib) {
         int sumi = 0;
@@ -3929,15 +4080,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
-void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
     UNUSED(by);
     UNUSED(bs);
 
-    const block_tq1_0 * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_tq1_0 * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -4252,15 +4403,15 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
 #endif
 }
 
-void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
     UNUSED(by);
     UNUSED(bs);
 
-    const block_tq2_0 * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_tq2_0 * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -4424,19 +4575,264 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
 #endif
 }
 
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q2_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q2_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
-#ifdef
+#ifdef __ARM_FEATURE_SVE
+    const int vector_length = svcntb()*8;
+    const svuint8_t m3s = svdup_n_u8(0x3);
+    const svuint32_t m4s = svdup_n_u32(0xF);
+    const svint32_t vzero_sv = svdup_n_s32(0);
+    svfloat32_t acc_sum = svdup_n_f32(0);
+    svbool_t pred_s32 = svptrue_pat_b32(SV_VL4);
+
+    switch (vector_length) {
+        case 128:
+            for (int i = 0; i < nb; ++i) {
+                const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+                svfloat32_t d_broad = svdup_n_f32((float32_t)d);
+                const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+                svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
+
+                const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+                const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+                const uint8_t * GGML_RESTRICT sc = x[i].scales;
+
+                svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc);
+                const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+                mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+4);
+                const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+                svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums);
+                svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+4);
+
+                const svint32_t s0 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_2, q8sums_sv_2));
+
+                mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+8);
+                const svint32_t mins_sv_3 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+                mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+12);
+                const svint32_t mins_sv_4 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+                q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums+8);
+                q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+12);
+
+                svint32_t s1 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_3, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_4, q8sums_sv_2));
+
+                svfloat32_t temp = svcvt_f32_s32_x(svptrue_b32(), svadd_s32_x(svptrue_b32(), s0, s1));
+
+                acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, temp, dmin_broad);
+
+                svint32_t sumi1 = svdup_n_s32(0);
+
+                {
+                    const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2);
+                    svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s));
+                    svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                    const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc), m4s));
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 0));
+
+                    const svuint8_t q2bits_3 = svld1_u8(svptrue_b8(), q2+16);
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_3, m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 1));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 2));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 3));
+
+
+                    const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+4), m4s));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 0));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 1));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 2));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 3));
+
+                    //-------------------------------
+
+                    q2 += 32;
+                    const svint32_t scales_sv_2 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+8), m4s));
+                    const svuint8_t q2bits_2 = svld1_u8(svptrue_b8(), q2);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_2, m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 0));
+
+                    const svuint8_t q2bits_4 = svld1_u8(svptrue_b8(), q2+16);
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_4, m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 1));
+
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 2));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 3));
+
+
+                    const svint32_t scales_sv_3 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+12), m4s));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 0));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 1));
+
+
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 2));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 3));
+                }
+                acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, svcvt_f32_s32_x(svptrue_b32(), sumi1), d_broad);
+            }
+            *s = svaddv_f32(svptrue_b32(), acc_sum);
+            break;
+
+        case 256:
+        case 512:
+            for (int i = 0; i < nb; ++i) {
+                const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+                svfloat32_t d_broad = svdup_n_f32((float32_t)d);
+                const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+                svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
+
+                const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+                const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+                const uint8_t * GGML_RESTRICT sc = x[i].scales;
+
+                const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8;
+                const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s));
+                const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, 4));
+                svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums);
+
+                const svuint32_t mins_and_scales_sve_1 = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc);
+                const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, m4s));
+                const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, 4));
+
+                svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums+8);
+
+                svfloat32_t temp = svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_2, q8sums_sv_2)));
+
+                acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad);
+
+                svint32_t sumi1 = svdup_n_s32(0);
+
+                {
+                    const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
+                    svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s));
+                    svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    svint32_t scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 0), svdup_lane_s32(scales_sv, 1));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    svint32_t scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 2), svdup_lane_s32(scales_sv, 3));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(svdup_n_s32(0), q2bytes_sv, q8bytes_sv), scale_2);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 4), svdup_lane_s32(scales_sv, 5));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 6), svdup_lane_s32(scales_sv, 7));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
+
+                    q2 += 32;
+
+                    const svuint8_t q2bits_2 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_2, m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 0), svdup_lane_s32(scales_sv_1, 1));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 2), svdup_lane_s32(scales_sv_1, 3));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 4), svdup_lane_s32(scales_sv_1, 5));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
|
|
4818
|
+
|
|
4819
|
+
q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 6), m3s));
|
|
4820
|
+
q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
|
|
4821
|
+
|
|
4822
|
+
scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7));
|
|
4823
|
+
sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
|
|
4824
|
+
}
|
|
4825
|
+
acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad);
|
|
4826
|
+
}
|
|
4827
|
+
*s = svaddv_f32(svptrue_pat_b32(SV_VL8), acc_sum);
|
|
4828
|
+
break;
|
|
4829
|
+
|
|
4830
|
+
default:
|
|
4831
|
+
assert(false && "Unsupported vector length");
|
|
4832
|
+
break;
|
|
4833
|
+
}
|
|
4834
|
+
|
|
4835
|
+
#elif __ARM_NEON
|
|
4440
4836
|
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
|
4441
4837
|
const uint8x16_t m4 = vdupq_n_u8(0xF);
|
|
4442
4838
|
|
|
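Note (not part of the diff): the SVE paths above compute the same reduction as ggml's portable scalar Q2_K × Q8_K fallback. As a reference for what the `svdot_s32`/`svmla_s32_m` chains accumulate, here is a minimal scalar sketch, assuming the upstream block layout (16 sub-blocks of 16 two-bit quants; a 4-bit scale and a 4-bit min packed per byte of `sc`):

```c
#include <stdint.h>

// Scalar sketch of one Q2_K x Q8_K super-block (QK_K == 256); illustration
// only. dall/dmin are the f16 block scales already multiplied by y->d.
static float q2K_block_dot_ref(const uint8_t *q2, const uint8_t *sc,
                               const int8_t *q8, const int16_t *bsums,
                               float dall, float dmin) {
    int summs = 0;                       // min part, folded through bsums
    for (int j = 0; j < 16; ++j) summs += bsums[j] * (sc[j] >> 4);

    int isum = 0, is = 0;
    for (int k = 0; k < 2; ++k) {        // two 128-quant halves
        for (int shift = 0; shift <= 6; shift += 2) {
            for (int half = 0; half < 2; ++half) {
                const int d = sc[is++] & 0xF;
                int part = 0;
                for (int l = 0; l < 16; ++l)
                    part += q8[l + 16*half] * ((q2[l + 16*half] >> shift) & 3);
                isum += d * part;
            }
            q8 += 32;
        }
        q2 += 32;
    }
    return dall * isum - dmin * summs;
}
```

The min terms never touch the individual quants: they fold through the precomputed `bsums`, which is exactly why the kernels above multiply `mins_sv_*` against `q8sums_sv_*` once per super-block.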
@@ -4451,9 +4847,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
-        const uint8_t * restrict sc = x[i].scales;
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT sc = x[i].scales;
 
         const uint8x16_t mins_and_scales = vld1q_u8(sc);
         const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
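Most of the remaining hunks in this file are mechanical: every C99 `restrict` qualifier becomes the `GGML_RESTRICT` macro, so the same translation units also build as C++ and under MSVC, where `restrict` is not a keyword. One plausible shape of the macro (an assumption for illustration, not copied from this diff; the real definition lives in the package's ggml headers):

```c
/* Sketch of the portability macro these hunks switch to (assumed):
 * plain C99 understands `restrict`, while C++ and MSVC spell it
 * `__restrict`. */
#if defined(__cplusplus) || defined(_MSC_VER)
#    define GGML_RESTRICT __restrict
#else
#    define GGML_RESTRICT restrict
#endif
```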
@@ -4516,8 +4912,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
         const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
@@ -4583,8 +4979,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // load mins and scales from block_q2_K.scales[QK_K/16]
         const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
@@ -4910,8 +5306,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         vector signed int vsumi6 = v0;
         vector signed int vsumi7 = v0;
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/128; ++j) {
             __builtin_prefetch(q2, 0, 1);
@@ -5002,8 +5398,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
         const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf);
@@ -5096,7 +5492,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 #endif
 }
 
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -5107,12 +5503,187 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const uint32_t kmask1 = 0x03030303;
     const uint32_t kmask2 = 0x0f0f0f0f;
 
-    const block_q3_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q3_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
-#ifdef __ARM_NEON
+#if defined(__ARM_FEATURE_SVE)
+
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    const int8_t m32 = 32;
+    const int vector_length = svcntb()*8;
+    const svuint8_t m3b_sv = svdup_n_u8(0x3);
+    const svint32_t vzero_sv = svdup_n_s32(0);
+
+    const svuint8_t m0_sv = svdup_n_u8(1);
+    const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1);
+    const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2);
+    const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3);
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
+        const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+
+        // Set up scales
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+
+        for (int j = 0; j < 16; ++j) scale[j] -= m32;
+
+        switch (vector_length) {
+            case 128:
+                {
+                    svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv);
+                    svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16);
+                    svuint8_t q3h_sv;
+
+                    svint32_t sumi1_1 = svdup_n_s32(0);
+                    svint8_t q3bytes_sv;
+
+                    for (int j = 0; j < QK_K/128; ++j) {
+
+                        const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
+                        const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
+                        svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                        svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                        q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2);
+                        q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
+
+                        q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2);
+                        q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
+
+                        q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                        q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                        q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1);
+                        q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
+
+                        q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1);
+                        q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
+
+
+                        scale += 4;
+                        q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                        q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                        q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1);
+                        q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
+
+                        q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2);
+                        q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
+
+
+                        q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                        q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                        q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1);
+                        q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
+
+                        q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1);
+                        q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
+
+                        if (j == 0) {
+                            qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4);
+                            qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4);
+                        }
+
+                        scale += 4;
+                    }
+
+                    sum += d * (svaddv_s32(svptrue_b32(), sumi1_1));
+                } break;
+            case 256:
+            case 512:
+                {
+                    svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv);
+                    svuint8_t q3h_sv;
+
+                    svint32_t sumi1_1 = svdup_n_s32(0);
+                    svint8_t q3bytes_sv;
+
+                    for (int j = 0; j < QK_K/128; ++j) {
+
+                        const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32;
+                        svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+                        svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                        q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2);
+                        q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+
+                        svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
+                        sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
+
+                        q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1);
+                        q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
+                        sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
+
+                        scale += 4;
+                        q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+                        q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                        q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv);
+                        q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
+                        sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
+
+                        q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1);
+                        q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                        scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
+                        sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
+
+                        if (j == 0) {
+                            qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4);
+                        }
+
+                        scale += 4;
+                    }
+
+                    sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1));
+                } break;
+            default:
+                assert(false && "Unsupported vector length");
+                break;
+        }
+    }
+    *s = sum;
+
+#elif __ARM_NEON
 
     uint32_t aux[3];
     uint32_t utmp[4];
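A note on the "Set up scales" step above (the same idiom recurs in the NEON and x86 q3_K paths below): twelve packed bytes hold sixteen 6-bit scales, with the low 4 bits in the first eight bytes and the high 2 bits gathered in the last four. A standalone sketch of the unpacking, lifted directly from the diff's own code (the `memcpy` replaces the kernel's in-place `(int8_t *)utmp` aliasing, purely for clarity):

```c
#include <stdint.h>
#include <string.h>

// Unpack q3_K's 12-byte scale field into 16 signed 6-bit scales.
void q3K_unpack_scales(const uint8_t packed[12], int8_t scale[16]) {
    const uint32_t kmask1 = 0x03030303;   // selects the high 2 bits
    const uint32_t kmask2 = 0x0f0f0f0f;   // selects the low 4 bits
    uint32_t aux[3], utmp[4];
    memcpy(aux, packed, 12);
    utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
    utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
    utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
    utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
    memcpy(scale, utmp, 16);
    for (int j = 0; j < 16; ++j) scale[j] -= 32;   // re-center to signed
}
```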
@@ -5134,9 +5705,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].hmask;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
@@ -5220,8 +5791,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // Set up scales
         memcpy(aux, x[i].scales, 12);
@@ -5325,8 +5896,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // Set up scales
         aux = (const uint32_t *)x[i].scales;
@@ -5459,9 +6030,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // Process blocks with SIMD
         int8_t * a = aux8;
@@ -5548,9 +6119,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].hmask;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(aux, x[i].scales, 12);
         utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
@@ -5690,8 +6261,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         vector signed int vsumi6 = v0;
         vector signed int vsumi7 = v0;
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/128; ++j) {
             __builtin_prefetch(q3, 0, 1);
@@ -5804,8 +6375,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     for (int i = 0; i < nb; ++i) {
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         // Set up scales
         memcpy(aux, x[i].scales, 12);
         __m128i scales128 = lsx_set_w(
@@ -5890,11 +6461,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
+        int8_t * GGML_RESTRICT a = aux8;
         uint8_t m = 1;
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
@@ -5937,7 +6508,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
 }
 
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
@@ -5945,8 +6516,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q4_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q4_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -5981,8 +6552,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const uint8_t * scales = (const uint8_t *)utmp;
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const int vector_length = ggml_cpu_get_sve_cnt()*8;
         const svuint8_t m4b = svdup_n_u8(0xf);
@@ -6069,8 +6640,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const uint8_t * scales = (const uint8_t *)utmp;
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
@@ -6108,8 +6679,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // Process scales and mins
         memcpy(utmp, x[i].scales, 12);
@@ -6121,7 +6692,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         // Sum mins * q8sums
         int32_t sumi = 0;
-        const int16_t * restrict q8sums = y[i].bsums;
+        const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
         const uint8_t * m = (const uint8_t *)&utmp[2];
         for (int j = 0; j < 16; j += 2) {
             sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
@@ -6220,8 +6791,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         utmp[2] = uaux;
         utmp[0] &= kmask1;
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
 
@@ -6279,8 +6850,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(utmp, x[i].scales, 12);
         utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -6380,8 +6951,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
         sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         vl = 32;
 
@@ -6482,8 +7053,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/64; j+=2) {
             __builtin_prefetch(q4, 0, 1);
@@ -6574,8 +7145,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         utmp[2] = uaux;
         utmp[0] &= kmask1;
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
         const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
@@ -6622,6 +7193,77 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
 
     *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
+#elif defined(__VXE__) || defined(__VXE2__)
+    const uint8x16_t v_lm = vec_splat_u8(0x0F);
+    const int32x4_t v_z = vec_splat_s32(0);
+
+    uint8x16_t v_x[2];
+    int8x16_t v_xl[2];
+    int8x16_t v_y[2];
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
+        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
+        const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
+
+        memcpy(utmp, x[i].scales, 12);
+
+        uint32x4_t v_mins8 = { 0 };
+        v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0);
+        v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1);
+
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[0] &= kmask1;
+
+        const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
+
+        const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
+        const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
+        const int32x4_t v_mins = v_minso + v_minse;
+        sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
+
+        const uint8_t * scales = (const uint8_t *)utmp;
+        const uint8_t * GGML_RESTRICT x0 = x[i].qs;
+        const int8_t * GGML_RESTRICT y0 = y[i].qs;
+
+        int32_t sumi1 = 0;
+        int32_t sumi2 = 0;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            v_x[0] = vec_xl(0 , x0);
+            v_x[1] = vec_xl(16, x0);
+            x0 += 32;
+
+            v_y[0] = vec_xl(0 , y0);
+            v_y[1] = vec_xl(16, y0);
+            y0 += 32;
+
+            v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm);
+            v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
+
+            const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
+            sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
+
+            v_y[0] = vec_xl(0 , y0);
+            v_y[1] = vec_xl(16, y0);
+            y0 += 32;
+
+            v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4);
+            v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
+
+            const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
+            sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
+        }
+
+        sumf += d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;
 #else
 
     const uint8_t * scales = (const uint8_t*)&utmp[0];
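The new VXE (s390x vector) Q4_K path above never subtracts the per-sub-block mins from individual quants; like the other backends, it folds them through `block_q8_K`'s precomputed `bsums` via `vec_mulo`/`vec_mule`. A scalar sketch of just that correction (illustration only; `mins[]` stands for the eight unpacked 6-bit q4_K mins):

```c
#include <stdint.h>

// Each q4_K min covers 32 quants, i.e. two adjacent 16-quant bsums entries,
// so the correction costs 8 multiply-adds per 256-quant super-block.
static float q4K_mins_correction(float dmin, const uint8_t mins[8],
                                 const int16_t bsums[16]) {
    int32_t acc = 0;
    for (int j = 0; j < 8; ++j) {
        acc += (bsums[2*j] + bsums[2*j+1]) * mins[j];
    }
    return -dmin * (float)acc;   // the `sumf -= dmin * (...)` line above
}
```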
@@ -6635,10 +7277,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
+        int8_t * GGML_RESTRICT a = aux8;
         for (int j = 0; j < QK_K/64; ++j) {
             for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
             a += 32;
@@ -6681,7 +7323,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 #endif
 }
 
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -6689,8 +7331,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q5_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q5_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -6732,9 +7374,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const uint8_t * scales = (const uint8_t *)utmp;
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
@@ -6779,8 +7421,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     float summs = 0.f;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -6863,8 +7505,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(utmp, x[i].scales, 12);
         utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -6955,9 +7597,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // Process scales and mins
         memcpy(utmp, x[i].scales, 12);
@@ -6969,7 +7611,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         // Sum mins * q8sums
         int32_t sumi_mins = 0;
-        const int16_t * restrict q8sums = y[i].bsums;
+        const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
         const uint8_t * m = (const uint8_t *)&utmp[2];
         for (int j = 0; j < 16; j += 2) {
             sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
@@ -7073,9 +7715,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         vl = 8;
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+        const uint8_t * GGML_RESTRICT hm = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
         const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
@@ -7214,8 +7856,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/64; ++j) {
             __builtin_prefetch(q5, 0, 1);
@@ -7287,8 +7929,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
     for (int i = 0; i < nb; ++i) {
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -7351,7 +7993,94 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4));
 
     *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
+#elif defined(__VXE__) || defined(__VXE2__)
+    const uint8x16_t v_lm = vec_splat_u8(0x0F);
+    const uint8x16_t v_1m = vec_splat_u8(0x01);
+    const uint8x16_t v_2m = vec_splat_u8(0x02);
+
+    const int32x4_t v_z = vec_splat_s32(0);
+
+    const uchar8x16_t v_minsm = {
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+    };
+
+    int8x16_t q5b[4];
+    uint8x16_t q5h[4];
+
+    uint8x16_t v_xl[2];
+    uint8x16_t v_xh[2];
+    int8x16_t v_y[4];
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
+        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
+        const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
+        const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
+        const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
+
+        const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
+        const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
+        const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
+        const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+
+        const uint8_t * scales = (const uint8_t *)utmp;
+        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
+        const uint8_t * GGML_RESTRICT x0h = x[i].qh;
+        const int8_t * GGML_RESTRICT y0 = y[i].qs;
+
+        v_xh[0] = vec_xl(0 , x0h);
+        v_xh[1] = vec_xl(16, x0h);
+
+        int32_t sumi = 0;
+        for (int j = 0; j < QK_K/64; ++j) {
+            v_xl[0] = vec_xl(0 , x0l);
+            v_xl[1] = vec_xl(16, x0l);
+            x0l += 32;
+
+            v_y[0] = vec_xl(0 , y0);
+            v_y[1] = vec_xl(16, y0);
+            v_y[2] = vec_xl(32, y0);
+            v_y[3] = vec_xl(48, y0);
+            y0 += 64;
+
+            q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
+            q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
+            q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
+            q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
+            v_xh[0] = vec_sr(v_xh[0], 2);
+            v_xh[1] = vec_sr(v_xh[1], 2);
 
+            q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
+            q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
+            q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
+            q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);
+
+            int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
+            int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
+
+            sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
+            sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
+        }
+
+        sumf += d * sumi - dmin * mins;
+    }
+
+    *s = sumf;
 #else
 
     const uint8_t * scales = (const uint8_t*)&utmp[0];
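The `q5h`/`q5b` lines in the VXE Q5_K path above reassemble each 5-bit quant from a low nibble in `qs` and one extra bit in `qh`. A scalar sketch of that merge, assuming the standard q5_K packing (two 32-quant groups share each `qs` byte; bit j of `qh[l]` is the high bit of group j):

```c
#include <stdint.h>

// Recover quant l (0..31) of 32-quant group j (0..7) from a q5_K block.
// Illustration only; the vector code computes the same value 16 lanes at
// a time by masking qh with 0x01/0x02 and shifting into bit 4.
static inline int q5K_quant(const uint8_t *qs, const uint8_t *qh,
                            int j, int l) {
    const uint8_t byte = qs[(j/2)*32 + l];
    const int lo = (j & 1) ? (byte >> 4) : (byte & 0xF);
    const int hi = (qh[l] >> j) & 1;
    return lo | (hi << 4);   // 5-bit value in 0..31
}
```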
@@ -7365,11 +8094,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+        const uint8_t * GGML_RESTRICT hm = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
+        int8_t * GGML_RESTRICT a = aux8;
         uint8_t m = 1;
         for (int j = 0; j < QK_K/64; ++j) {
             for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
@@ -7416,7 +8145,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 #endif
 }
 
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -7424,8 +8153,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q6_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q6_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -7445,11 +8174,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const float d_all = GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q6 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
-        const int8_t * restrict scale = x[i].scales;
+        const int8_t * GGML_RESTRICT scale = x[i].scales;
 
         const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
         const int8x16_t scales = vld1q_s8(scale);
@@ -7536,9 +8265,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
 
@@ -7614,9 +8343,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // handle the q6_k -32 offset separately using bsums
         const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
@@ -7715,8 +8444,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
     for (int i = 0; i < nb; ++i) {
         // Unpack 6-bit quantized data into aux8 (unchanged)
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
+        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
         int8_t * a = aux8;
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; ++l) {
@@ -7730,8 +8459,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
             qh += 32;
         }
 
-        const int8_t * restrict a_ptr = aux8;
-        const int8_t * restrict q8 = y[i].qs;
+        const int8_t * GGML_RESTRICT a_ptr = aux8;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         v128_t acc0 = wasm_i32x4_splat(0);
         v128_t acc1 = wasm_i32x4_splat(0);
 
@@ -7794,11 +8523,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
 
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q6 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
-        const int8_t * restrict scale = x[i].scales;
+        const int8_t * GGML_RESTRICT scale = x[i].scales;
 
         size_t vl;
 
@@ -7900,10 +8629,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         vector signed int vsumi6 = v0;
         vector signed int vsumi7 = v0;
 
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict qs = x[i].scales;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q6 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT qs = x[i].scales;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/128; ++j) {
             __builtin_prefetch(q6, 0, 0);
@@ -8019,9 +8748,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
         const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
@@ -8068,7 +8797,130 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     }
 
     *s = hsum_float_8(acc);
+#elif defined(__VXE__) || defined(__VXE2__)
+    float sum = 0;
+
+    // Lower 4-bit and upper 2-bit masks
+    const uint8x16_t v_lm = vec_splat_u8(0x0F);
+    const uint8x16_t v_um = vec_splat_u8(0x03);
+
+    const int32x4_t v_z = vec_splat_s32(0);
+
+    int8x16_t q6b[4];
+    uint8x16_t q6h[4];
+
+    uint8x16_t v_xl[4];
+    uint8x16_t v_xh[2];
+    int8x16_t v_y[4];
+
+    for (int i = 0; i < nb; ++i) {
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * GGML_RESTRICT x0l = x[i].ql;
+        const uint8_t * GGML_RESTRICT x0h = x[i].qh;
+        const int8_t * GGML_RESTRICT y0 = y[i].qs;
 
+        const int8_t * GGML_RESTRICT scale = x[i].scales;
+
+        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
+        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
+
+        const int8x16_t v_scale = vec_xl(0, scale);
+        const int16x8_t v_scalel = vec_unpackh(v_scale);
+        const int16x8_t v_scaleh = vec_unpackl(v_scale);
+
+        const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
+        const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
+        const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
+        const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
+        const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
+
+        const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+
+        int32_t isum = 0;
+        for (int j = 0; j < QK_K/128; ++j) {
+            // Load model upper 2 bits
+            v_xh[0] = vec_xl(0 , x0h);
+            v_xh[1] = vec_xl(16, x0h);
+            x0h += 32;
+
+            // Load model lower 4 bits
+            v_xl[0] = vec_xl(0 , x0l);
+            v_xl[1] = vec_xl(16, x0l);
+            v_xl[2] = vec_xl(32, x0l);
+            v_xl[3] = vec_xl(48, x0l);
+            x0l += 64;
+
+            // Load activation quants
+            v_y[0] = vec_xl(0 , y0);
+            v_y[1] = vec_xl(16, y0);
+            v_y[2] = vec_xl(32, y0);
+            v_y[3] = vec_xl(48, y0);
+            y0 += 64;
+
+            q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4);
+            q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4);
+            uint8x16_t shifted = vec_sr(v_xh[0], 2);
+            q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
+            shifted = vec_sr(v_xh[1], 2);
+            q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
+
+            q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0]));
+            q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1]));
+            q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2]));
+            q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3]));
+
+            int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
+            int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
+            int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
+            int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
+
+            isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
+                    (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
+                    (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
+                    (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+
+            scale += 4;
+
+
+            // Load activation quants
+            v_y[0] = vec_xl(0 , y0);
+            v_y[1] = vec_xl(16, y0);
+            v_y[2] = vec_xl(32, y0);
+            v_y[3] = vec_xl(48, y0);
+            y0 += 64;
+
+            shifted = vec_sr(v_xh[0], 4);
+            q6h[0] = vec_sl(vec_and(v_um, shifted), 4);
+            shifted = vec_sr(v_xh[1], 4);
+            q6h[1] = vec_sl(vec_and(v_um, shifted), 4);
+            shifted = vec_sr(v_xh[0], 6);
+            q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
+            shifted = vec_sr(v_xh[1], 6);
+            q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
+
+            q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0]));
+            q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1]));
+            q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2]));
+            q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3]));
+
+            summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
+            summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
+            summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
+            summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
+
+            isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
+                    (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
+                    (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
+                    (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+
+            scale += 4;
+        }
+
+        sum += d_all * y[i].d * (isum - 32 * mins);
+    }
+
+    *s = sum;
 #else
 
     int8_t aux8[QK_K];
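The `(isum - 32 * mins)` line above is the same offset trick the AVX path flags with its "handle the q6_k -32 offset separately using bsums" comment: q6_K stores quants biased by +32 (0..63 for a nominal -32..31), so rather than subtract 32 from all 256 quants, the kernel subtracts 32 times the scale-weighted sum of `bsums` once per super-block. A scalar sketch of the final combination:

```c
#include <stdint.h>

// Combine a q6_K super-block's biased integer dot product with the
// precomputed -32 correction (illustration of the line
// `sum += d_all * y[i].d * (isum - 32 * mins);` above).
static float q6K_block_finish(int32_t isum_biased, const int8_t scale[16],
                              const int16_t bsums[16], float d_all, float y_d) {
    int32_t mins = 0;   // scale-weighted sum of the 16 sub-block q8 sums
    for (int j = 0; j < 16; ++j) mins += (int32_t)scale[j] * bsums[j];
    return d_all * y_d * (isum_biased - 32 * mins);
}
```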
@@ -8079,11 +8931,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
8079
8931
|
|
|
8080
8932
|
float sumf = 0;
|
|
8081
8933
|
for (int i = 0; i < nb; ++i) {
|
|
8082
|
-
const uint8_t *
|
|
8083
|
-
const uint8_t *
|
|
8084
|
-
const int8_t *
|
|
8934
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
8935
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
8936
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
8085
8937
|
memset(aux32, 0, 8*sizeof(int32_t));
|
|
8086
|
-
int8_t *
|
|
8938
|
+
int8_t * GGML_RESTRICT a = aux8;
|
|
8087
8939
|
for (int j = 0; j < QK_K; j += 128) {
|
|
8088
8940
|
for (int l = 0; l < 32; ++l) {
|
|
8089
8941
|
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
|
@@ -8151,7 +9003,7 @@ static const int8_t keven_signs_q2xs[1024] = {
 };
 #endif
 
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -8159,8 +9011,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     UNUSED(by);
     UNUSED(bs);
 
-    const block_iq2_xxs * restrict x = vx;
-    const block_q8_K    * restrict y = vy;
+    const block_iq2_xxs * GGML_RESTRICT x = vx;
+    const block_q8_K    * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -8178,8 +9030,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         float sumf1 = 0, sumf2 = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -8215,8 +9067,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -8256,8 +9108,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         __m128i sumi1_0 = _mm_setzero_si128();
         __m128i sumi1_1 = _mm_setzero_si128();
         __m128i sumi2_0 = _mm_setzero_si128();
@@ -8321,8 +9173,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;
 
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/32; j += 2) {
             __builtin_prefetch(q2, 0, 1);
@@ -8398,8 +9250,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = __lasx_xvldi(0);
         __m256i sumi2 = __lasx_xvldi(0);
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -8429,7 +9281,57 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     }
 
     *s = 0.125f * hsum_float_8(accumf);
-
+//#elif defined(__VXE__) || defined(__VXE2__)
+//    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+//
+//    uint32_t aux32[4];
+//    const uint8_t * aux8 = (const uint8_t *)aux32;
+//
+//    float sumf = 0;
+//
+//    for (int i = 0; i < nb; ++i) {
+//        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+//        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+//        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
+//
+//        float sumf1 = 0, sumf2 = 0;
+//
+//        for (int ib32 = 0; ib32 < QK_K/32; ib += 2) {
+//            int8x16_t q8b0 = vec_xl( 0, q8);
+//            int8x16_t qb81 = vec_xl(16, q8);
+//            int8x16_t q8b2 = vec_xl(32, q8);
+//            int8x16_t q8b3 = vec_xl(48, q8);
+//            q8 += 64;
+//
+//            memcpy(aux32, q2, 4 * sizeof(uint32_t));
+//            q2 += 8;
+//
+//            int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) };
+//            int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) };
+//            int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) };
+//            int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) };
+//
+//            int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >>  7) & 127)) };
+//            int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) };
+//            int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >>  7) & 127)) };
+//            int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) };
+//
+//            q2u0 = vec_mul(q2u0, q2s0);
+//            q2u1 = vec_mul(q2u1, q2s1);
+//            q2u2 = vec_mul(q2u2, q2s2);
+//            q2u3 = vec_mul(q2u3, q2s3);
+//
+//            const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1);
+//            const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3);
+//
+//            sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28));
+//            sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28));
+//        }
+//
+//        sumf += d * (sumf1 + sumf2);
+//    }
+//
+//    *s = 0.25f * sumf;
 #else
 
     uint32_t aux32[2];
@@ -8438,8 +9340,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
             memcpy(aux32, q2, 2*sizeof(uint32_t));
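
Editor's note: in the iq2_xxs decode above, every second 32-bit word carries four 7-bit indices into the precomputed `keven_signs_q2xs` table (bits 0..27) plus a 4-bit block scale (bits 28..31) that is applied as `0.5f + (aux32 >> 28)`. A small sketch of that field extraction, using a hypothetical packed word:

    #include <stdint.h>
    #include <stdio.h>

    // Split an iq2_xxs control word into four 7-bit sign-table indices and
    // the 4-bit scale, mirroring the masks used in the loops above.
    int main(void) {
        const uint32_t aux = 0xA0012345u; // hypothetical packed word
        for (int k = 0; k < 4; ++k) {
            printf("sign index %d: %u\n", k, (aux >> (7 * k)) & 127);
        }
        printf("scale: %.1f\n", 0.5f + (aux >> 28)); // 10.5
        return 0;
    }
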
@@ -8462,7 +9364,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 #endif
 }
 
-void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -8470,8 +9372,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     UNUSED(by);
     UNUSED(bs);
 
-    const block_iq2_xs * restrict x = vx;
-    const block_q8_K   * restrict y = vy;
+    const block_iq2_xs * GGML_RESTRICT x = vx;
+    const block_q8_K   * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -8488,8 +9390,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         const uint8x8_t scales8 = vld1_u8(x[i].scales);
         const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
         const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
@@ -8566,8 +9468,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(&aux64, x[i].scales, 8);
         __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8687,8 +9589,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(&aux64, x[i].scales, 8);
         __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8842,8 +9744,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(&aux64, x[i].scales, 8);
         __m128i stmp = __lsx_vreplgr2vr_d(aux64);
@@ -8940,9 +9842,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;
 
-        const uint16_t * restrict q2 = x[i].qs;
-        const uint8_t  * restrict sc = x[i].scales;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/64; ++j) {
             __builtin_prefetch(q2, 0, 1);
@@ -9012,9 +9914,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const uint8_t  * restrict sc = x[i].scales;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
             const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
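
Editor's note: iq2_xs (like several other i-quants in this file) stores block scales as 4-bit values and applies them as `2*s + 1`, so the effective integer multipliers are the odd numbers 1..31 and can never be zero. A one-line check of that mapping, for illustration only:

    #include <stdio.h>

    int main(void) {
        for (int s = 0; s < 16; ++s) {
            printf("stored %2d -> effective %2d\n", s, 2 * s + 1);
        }
        return 0;
    }
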
@@ -9047,7 +9949,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 #endif
 }
 
-void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -9055,8 +9957,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
     UNUSED(by);
     UNUSED(bs);
 
-    const block_iq2_s * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
+    const block_iq2_s * GGML_RESTRICT x = vx;
+    const block_q8_K  * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -9082,10 +9984,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
 
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
 
-        const uint8_t  * restrict qs = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         int sumi1 = 0, sumi2 = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9156,10 +10058,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t  * restrict qs = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(&aux64, x[i].scales, 8);
         const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -9229,10 +10131,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t  * restrict qs = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(&aux64, x[i].scales, 8);
         const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -9327,11 +10229,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;
 
-        const uint8_t  * restrict q2 = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const uint8_t  * restrict sc = x[i].scales;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT q2 = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/32; j += 2) {
             __builtin_prefetch(q2, 0, 1);
@@ -9428,10 +10330,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t  * restrict qs = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         __m128i tmp1;
         memcpy(&aux64, x[i].scales, 8);
@@ -9525,7 +10427,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
 
 }
 
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -9533,8 +10435,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     UNUSED(by);
     UNUSED(bs);
 
-    const block_iq3_xxs * restrict x = vx;
-    const block_q8_K    * restrict y = vy;
+    const block_iq3_xxs * GGML_RESTRICT x = vx;
+    const block_q8_K    * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -9550,9 +10452,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
         float sumf1 = 0, sumf2 = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -9588,9 +10490,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9633,9 +10535,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
         __m128i sumi1_0 = _mm_setzero_si128();
         __m128i sumi1_1 = _mm_setzero_si128();
         __m128i sumi2_0 = _mm_setzero_si128();
@@ -9702,9 +10604,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;
 
-        const uint8_t  * restrict q3 = x[i].qs;
-        const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT q3 = x[i].qs;
+        const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4);
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
 #pragma GCC unroll 1
         for (int j = 0; j < QK_K/32; j += 2) {
@@ -9776,9 +10678,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = __lasx_xvldi(0);
         __m256i sumi2 = __lasx_xvldi(0);
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9821,9 +10723,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
             memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
@@ -9848,7 +10750,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 #endif
 }
 
-void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -9856,8 +10758,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     UNUSED(by);
     UNUSED(bs);
 
-    const block_iq3_s * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
+    const block_iq3_s * GGML_RESTRICT x = vx;
+    const block_q8_K  * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -9894,10 +10796,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t  * restrict qs = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(scales32, x[i].scales, 4);
         scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
@@ -9976,10 +10878,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t  * restrict qs = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10061,10 +10963,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t  * restrict qs = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         __m128i sumi1_0 = _mm_setzero_si128();
         __m128i sumi1_1 = _mm_setzero_si128();
         __m128i sumi2_0 = _mm_setzero_si128();
@@ -10162,11 +11064,11 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);
 
-        const uint8_t  * restrict q3 = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
-        const uint8_t  * restrict sc = x[i].scales;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs);
+        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
 
         vector signed int vsumi0 = v0;
         vector signed int vsumi1 = v0;
@@ -10273,10 +11175,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t  * restrict qs = x[i].qs;
-        const uint8_t  * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t   * restrict q8 = y[i].qs;
+        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = __lasx_xvldi(0);
         __m256i sumi2 = __lasx_xvldi(0);
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10334,10 +11236,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint8_t * restrict signs = x[i].signs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint8_t * GGML_RESTRICT signs = x[i].signs;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
@@ -10389,7 +11291,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
 }
 #endif
 
-void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -10397,8 +11299,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
     UNUSED(by);
     UNUSED(bs);
 
-    const block_iq1_s * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
+    const block_iq1_s * GGML_RESTRICT x = vx;
+    const block_q8_K  * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -10460,10 +11362,19 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
         __m256i sumi = _mm256_setzero_si256();
         int sumi1 = 0;
         for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+            const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
+            const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
+            const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+            const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+#else
             const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
                                                     iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
             const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
                                                     iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+#endif
             qs += 8;
             const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
             const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
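
Editor's note: the new `__BMI2__` branch above builds all four 11-bit `iq1s_grid` indices of a group at once — one `_pdep_u64` scatters the four `qs` bytes into the low byte of each 16-bit lane, a second scatters three `qh` bits into bits 8..10, and an OR fuses them, matching the shift-and-mask expressions in the `#else` branch. A portable model of `_pdep_u64` applied to those masks, with hypothetical `qs`/`qh` values:

    #include <stdint.h>
    #include <stdio.h>

    // Model of BMI2 _pdep_u64: successive low bits of src are deposited at
    // the positions of the set bits of mask, from low to high.
    static uint64_t pdep_u64(uint64_t src, uint64_t mask) {
        uint64_t out = 0;
        for (uint64_t bit = 1; mask; bit <<= 1) {
            const uint64_t lowest = mask & -mask; // lowest set bit of mask
            if (src & bit) out |= lowest;
            mask &= mask - 1;                     // clear that bit
        }
        return out;
    }

    int main(void) {
        const uint32_t qs = 0x04030201u; // four packed 8-bit values
        const uint16_t qh = 0x0FFF;      // three high bits per value
        const uint64_t packed = pdep_u64(qs, 0x00ff00ff00ff00ffULL)
                              | pdep_u64(qh, 0x700070007000700ULL);
        for (int k = 0; k < 4; ++k) { // prints 0x701 0x702 0x703 0x704
            printf("index %d: 0x%03x\n", k, (unsigned)((packed >> (16 * k)) & 0x7FF));
        }
        return 0;
    }
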
@@ -10556,10 +11467,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
         vector signed int vsumi3 = vec_splats((int32_t)0);
         vector signed int vsumi8 = vec_splats((int32_t)0);
 
-        const uint8_t  * restrict q1 = x[i].qs;
-        const uint16_t * restrict qh = x[i].qh;
-        const int8_t   * restrict q8 = y[i].qs;
-        const int16_t  * restrict qs = y[i].bsums;
+        const uint8_t  * GGML_RESTRICT q1 = x[i].qs;
+        const uint16_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
+        const int16_t  * GGML_RESTRICT qs = y[i].bsums;
 
         for (int j = 0; j < QK_K/32; j += 2) {
             __builtin_prefetch(q1, 0, 1);
@@ -10720,7 +11631,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
 #endif
 }
 
-void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -10728,8 +11639,8 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
     UNUSED(by);
     UNUSED(bs);
 
-    const block_iq1_m * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
+    const block_iq1_m * GGML_RESTRICT x = vx;
+    const block_q8_K  * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -10809,6 +11720,10 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
 
     const __m256i mask = _mm256_set1_epi16(0x7);
     const __m256i mone = _mm256_set1_epi16(1);
+    const __m256i mone8 = _mm256_set1_epi8(1);
+    const __m256i mtwo8 = _mm256_set1_epi8(2);
+    // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half.
+    const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);
 
     __m256 accum1 = _mm256_setzero_ps();
     __m256 accum2 = _mm256_setzero_ps();
@@ -10820,10 +11735,33 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
         const uint16_t * sc = (const uint16_t *)x[i].scales;
 
         scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+        // Extract 3-bit scales (16 values)
+        __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc);
+        scales = _mm256_srlv_epi64(scales, scales_shift);
+        scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);
+
+        // Indices to repeat each scale 8 times.
+        __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
+        __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));
 
         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+            const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
+                                       | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
+            const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
+                                       | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
+            const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+            const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+
+            // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
+            const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
+            const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
+            const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
+#else
             const __m256i q1b_1 = _mm256_set_epi64x(
                     iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
                     iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
@@ -10832,11 +11770,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
                     iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
                     iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
             );
-            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
-            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
 
             const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                      qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
@@ -10846,15 +11779,21 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
                                                      qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
                                                      qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                      qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+#endif
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
 
-            const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
-            const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+            const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
+            const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
+
+            __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
+            __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);
 
-            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
-            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+            scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
+            scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);
 
-            scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
-            scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
             const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
             const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
             const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
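
Editor's note: the rewritten `dot3`/`dot4` above exploit that each iq1_m delta byte is +1 or negative: `_mm256_sign_epi8(q8b, delta)` conditionally negates the q8 bytes, and `_mm256_maddubs_epi16(mone8, ...)` pair-sums the result into 16-bit lanes, replacing a full byte multiply. A scalar model of one 8-value group with illustrative values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int8_t  q8[8] = {1, -2, 3, -4, 5, -6, 7, -8};
        const uint8_t qh    = 0x08; // bit 3 set -> this group's delta is -1
        const int     delta = (qh & 0x08) ? -1 : +1;
        int sum = 0;
        // delta * q8[j] reduces to a conditional negation of q8[j].
        for (int j = 0; j < 8; ++j) sum += delta * q8[j];
        printf("group sum: %d\n", sum); // -(1-2+3-4+5-6+7-8) = 4
        return 0;
    }
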
@@ -11010,7 +11949,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
 #endif
 }
 
-void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -11019,8 +11958,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     assert(n % QK4_NL == 0);
     static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
 
-    const block_iq4_nl * restrict x = vx;
-    const block_q8_0   * restrict y = vy;
+    const block_iq4_nl * GGML_RESTRICT x = vx;
+    const block_q8_0   * GGML_RESTRICT y = vy;
 
     const int nb = n / QK4_NL;
 
@@ -11190,6 +12129,27 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 
     sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
 
+#elif defined(__VXE__) || defined(__VXE2__)
+    const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
+    const uint8x16_t v_m = vec_splat_u8(0x0F);
+
+    for (; ib < nb; ++ib) {
+        const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+
+        const uint8x16_t v_x = vec_xl(0, x0->qs);
+        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
+        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
+
+        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
+        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
+
+        const int8x16_t v_yl = vec_xl(0      , y0->qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
+        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+
+        sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
+    }
 #endif
     for (; ib < nb; ++ib) {
         const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
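
Editor's note: IQ4_NL maps each 4-bit code through a fixed 16-entry non-linear codebook rather than the affine `code - 8` of Q4_0; the VXE branch above performs the lookup with a single `vec_perm` byte shuffle over `kvalues_iq4nl`. A scalar sketch — the table values below are assumed to match ggml's `kvalues_iq4nl` and are shown for illustration only:

    #include <stdint.h>
    #include <stdio.h>

    static const int8_t kvalues[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
    };

    int main(void) {
        const uint8_t packed = 0x8F; // low nibble 15, high nibble 8
        printf("low  -> %d\n", kvalues[packed & 0x0F]); // 113
        printf("high -> %d\n", kvalues[packed >> 4]);   // 1
        return 0;
    }
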
@@ -11203,7 +12163,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     *s = sumf;
 }
 
-void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -11211,8 +12171,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     UNUSED(bs);
     assert(n % QK_K == 0);
 
-    const block_iq4_xs * restrict x = vx;
-    const block_q8_K   * restrict y = vy;
+    const block_iq4_xs * GGML_RESTRICT x = vx;
+    const block_q8_K   * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -11369,9 +12329,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
         uint16_t h = x[ibl].scales_h;
 
-        const uint8_t * restrict q4 = x[ibl].qs;
-        const uint8_t * restrict sc = x[ibl].scales_l;
-        const int8_t  * restrict q8 = y[ibl].qs;
+        const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
+        const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l;
+        const int8_t  * GGML_RESTRICT q8 = y[ibl].qs;
 
         for (int ib = 0; ib < QK_K/64; ib ++ ) {
             __builtin_prefetch(q4, 0, 1);
@@ -11468,6 +12428,56 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     }
 
     *s = hsum_float_8(accum);
+#elif defined(__VXE__) || defined(__VXE2__)
+    const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
+    const uint8x16_t v_m = vec_splat_u8(0x0F);
+
+    float sumf = 0;
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
+        const int8_t  * GGML_RESTRICT q8 = y[ibl].qs;
+
+        uint16_t h = x[ibl].scales_h;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/64; ++ib) {
+            const uint8x16_t v_x0 = vec_xl(0       , q4);
+            const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4);
+            q4 += 32;
+
+            int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
+            int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
+            int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
+            int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
+
+            v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
+            v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
+            v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
+            v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
+
+            const int8x16_t v_y0 = vec_xl( 0, q8);
+            const int8x16_t v_y1 = vec_xl(16, q8);
+            const int8x16_t v_y2 = vec_xl(32, q8);
+            const int8x16_t v_y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1);
+            int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);
+
+            int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
+            int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
+
+            h >>= 4;
+
+            sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
+            sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
+        }
+
+        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;
 
 #else
     float sumf = 0;
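
Editor's note: iq4_xs block scales are 6-bit values split across two fields — a 4-bit low part in `scales_l` and a 2-bit high part shifted out of the rolling `scales_h` word — recentred by -32, exactly as `ls1`/`ls2` are computed in the VXE loop above. A standalone check with hypothetical field values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint8_t  scales_l = 0xA7;   // two 4-bit low parts: 7 and 10
        const uint16_t h        = 0x000B; // low 4 bits: two 2-bit high parts
        const int ls1 = ((scales_l & 0xF) | ((h << 4) & 0x30)) - 32;
        const int ls2 = ((scales_l >>  4) | ((h << 2) & 0x30)) - 32;
        printf("ls1 = %d, ls2 = %d\n", ls1, ls2); // ls1 = 23, ls2 = 10
        return 0;
    }
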
@@ -11506,12 +12516,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
 // ============================ 4-bit non-linear quants
 
-void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     assert(k % QK4_NL == 0);
     quantize_row_iq4_nl_ref(x, y, k);
 }
 
-void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) {
+void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     assert(k % QK_K == 0);
     quantize_iq4_xs(x, y, 1, k, NULL);
 }