@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
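
The kernel diff reproduced below is dominated by three kinds of change: the C99 `restrict` qualifier on the quantization and dot-product routines is replaced by the GGML_RESTRICT macro, the LoongArch ASX helper functions are simplified, and new WebAssembly SIMD128 and s390x vector (VXE/VXE2) code paths are added. As a rough, hedged illustration of why such a macro exists -- this is a sketch of the usual portability pattern, not the exact definition shipped in this package's headers -- a GGML_RESTRICT-style wrapper typically looks like:

    /* Illustrative sketch only (assumed pattern, not copied from ggml.h):
     * `restrict` is not a standard C++ keyword, so hiding it behind a macro
     * lets the same declarations compile as both C and C++ while keeping the
     * no-aliasing hint for the optimizer in C builds. */
    #if defined(_MSC_VER)
    #    define GGML_RESTRICT __restrict      /* MSVC extension */
    #elif defined(__cplusplus)
    #    define GGML_RESTRICT __restrict__    /* GCC/Clang C++ extension */
    #else
    #    define GGML_RESTRICT restrict        /* plain C99 keyword */
    #endif

    /* Usage mirrors the signatures in the diff below: */
    void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
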
@@ -501,30 +501,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
 }
 
 static __m256i lasx_extu8_16(__m128i a) {
-
-    __m128i vlo = __lsx_vilvl_b(zero, a);
-    __m128i vhi = __lsx_vilvh_b(zero, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_hu_bu(____m256i(a));
 }
 
 static __m256i lasx_ext8_16(__m128i a) {
-
-    __m128i vlo = __lsx_vilvl_b(sign, a);
-    __m128i vhi = __lsx_vilvh_b(sign, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_h_b(____m256i(a));
 }
 
 static __m256i lasx_ext16_32(__m128i a) {
-
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
-    return tmp1;
+    return __lasx_vext2xv_w_h(____m256i(a));
 }
 
 static __m128i lasx_extracti128( __m256i a, int pos) {
@@ -577,6 +562,41 @@ static __m256i lasx_packs_h(__m256i a, __m256i b) {
     return __lasx_xvpickev_b(tmp1, tmp);
 }
 
+static inline __m256i lasx_madd_h_b(__m256i a, __m256i b) {
+    __m256i tmp1, tmp2;
+    tmp1 = __lasx_xvmulwev_h_b(a, b);
+    tmp2 = __lasx_xvmulwod_h_b(a, b);
+    return __lasx_xvadd_h(tmp1, tmp2);
+}
+
+static inline __m256i lasx_xvrepl128vei_h(__m256i a, const unsigned int b) {
+    switch (b) {
+        case 0: return __lasx_xvrepl128vei_h(a, 0);
+        case 1: return __lasx_xvrepl128vei_h(a, 1);
+        case 2: return __lasx_xvrepl128vei_h(a, 2);
+        case 3: return __lasx_xvrepl128vei_h(a, 3);
+        case 4: return __lasx_xvrepl128vei_h(a, 4);
+        case 5: return __lasx_xvrepl128vei_h(a, 5);
+        case 6: return __lasx_xvrepl128vei_h(a, 6);
+        case 7: return __lasx_xvrepl128vei_h(a, 7);
+        default: __builtin_unreachable();
+    }
+}
+
+static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
+    switch (b) {
+        case 0: return __lasx_xvandi_b(a, 1 << 0);
+        case 1: return __lasx_xvandi_b(a, 1 << 1);
+        case 2: return __lasx_xvandi_b(a, 1 << 2);
+        case 3: return __lasx_xvandi_b(a, 1 << 3);
+        case 4: return __lasx_xvandi_b(a, 1 << 4);
+        case 5: return __lasx_xvandi_b(a, 1 << 5);
+        case 6: return __lasx_xvandi_b(a, 1 << 6);
+        case 7: return __lasx_xvandi_b(a, 1 << 7);
+        default: __builtin_unreachable();
+    }
+}
+
 // multiply int8_t, add results pairwise twice
 static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
     // Get absolute values of x vectors
@@ -592,12 +612,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
     __m128 res = lasx_extractf128(x, 1);
-    ft_union tmp;
     res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
-
-    return tmp.f;
+    return ((v4f32)res)[0];
 }
 
 // horizontally add 8 int32_t
@@ -673,13 +691,8 @@ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy)
 
 // multiply int8_t, add results pairwise twice and return as float vector
 static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-
-
-    const __m256i ax = __lasx_xvsigncov_b(x, x);
-    // Sign the values of the y vectors
-    const __m256i sy = __lasx_xvsigncov_b(x, y);
-
-    return mul_sum_us8_pairs_float(ax, sy);
+    const __m256i dot = lasx_madd_h_b(x, y);
+    return sum_i16_pairs_float(dot);
 }
 
 static inline __m128i packNibbles( __m256i bytes ) {
@@ -706,28 +719,28 @@ static inline __m128i packNibbles( __m256i bytes ) {
 }
 #endif //__loongarch_asx
 
-void quantize_row_q4_0(const float *
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     quantize_row_q4_0_ref(x, y, k);
 }
 
-void quantize_row_q4_1(const float *
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     quantize_row_q4_1_ref(x, y, k);
 }
 
-void quantize_row_q5_0(const float *
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q5_0_ref(x, y, k);
 }
 
-void quantize_row_q5_1(const float *
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     quantize_row_q5_1_ref(x, y, k);
 }
 
-void quantize_row_q8_0(const float *
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
 
-    block_q8_0 *
+    block_q8_0 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
@@ -759,7 +772,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
             y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
         }
     }
-#elif defined
+#elif defined __wasm_simd128__
     for (int i = 0; i < nb; i++) {
         v128_t srcv [8];
         v128_t asrcv[8];
@@ -939,7 +952,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union fi;
         __m256 v0 = (__m256)__lasx_xvld( x , 0);
         __m256 v1 = (__m256)__lasx_xvld( x , 32);
         __m256 v2 = (__m256)__lasx_xvld( x , 64);
@@ -957,8 +969,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
-
-        const float max_scalar = fi.f;
+        const float max_scalar = ((v4f32)max4)[0];
 
         // Quantize these floats
         const float d = max_scalar / 127.f;
@@ -1000,6 +1011,38 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
 
     }
+#elif defined(__VXE__) || defined(__VXE2__)
+    for (int i = 0; i < nb; i++) {
+        __vector float srcv [8];
+        __vector float asrcv[8];
+        __vector float amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f / d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const __vector float v = vec_mul(srcv[j], vec_splats(id));
+            const __vector int32_t vi = vec_signed(v);
+
+            y[i].qs[4*j + 0] = vec_extract(vi, 0);
+            y[i].qs[4*j + 1] = vec_extract(vi, 1);
+            y[i].qs[4*j + 2] = vec_extract(vi, 2);
+            y[i].qs[4*j + 3] = vec_extract(vi, 3);
+        }
+    }
 #else
     GGML_UNUSED(nb);
     // scalar
@@ -1007,11 +1050,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 #endif
 }
 
-void quantize_row_q8_1(const float *
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK8_1 == 0);
     const int nb = k / QK8_1;
 
-    block_q8_1 *
+    block_q8_1 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
@@ -1049,7 +1092,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 
         y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
     }
-#elif defined
+#elif defined __wasm_simd128__
     for (int i = 0; i < nb; i++) {
         v128_t srcv [8];
         v128_t asrcv[8];
@@ -1263,7 +1306,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union ft;
         __m256 v0 = (__m256)__lasx_xvld( x , 0 );
         __m256 v1 = (__m256)__lasx_xvld( x , 32 );
         __m256 v2 = (__m256)__lasx_xvld( x , 64 );
@@ -1281,8 +1323,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
-
-        const float max_scalar = ft.f;
+        const float max_scalar = ((v4f32)max4)[0];
 
         // Quantize these floats
         const float d = max_scalar / 127.f;
@@ -1328,6 +1369,44 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0);
         __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
     }
+#elif defined(__VXE__) || defined(__VXE2__)
+    for (int i = 0; i < nb; i++) {
+        __vector float srcv [8];
+        __vector float asrcv[8];
+        __vector float amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f / d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        __vector int32_t acc = vec_splats(0);
+
+        for (int j = 0; j < 8; j++) {
+            const __vector float v = vec_mul(srcv[j], vec_splats(id));
+            const __vector int32_t vi = vec_signed(v);
+
+            y[i].qs[4*j + 0] = vec_extract(vi, 0);
+            y[i].qs[4*j + 1] = vec_extract(vi, 1);
+            y[i].qs[4*j + 2] = vec_extract(vi, 2);
+            y[i].qs[4*j + 3] = vec_extract(vi, 3);
+
+            acc = vec_add(acc, vi);
+        }
+
+        y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
+    }
 #else
     GGML_UNUSED(nb);
     // scalar
@@ -1349,8 +1428,8 @@ static inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
-static float make_qx_quants(int n, int nmax, const float *
-        const float *
+static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
+        const float * GGML_RESTRICT qw) {
     float max = 0;
     float amax = 0;
     for (int i = 0; i < n; ++i) {
@@ -1418,7 +1497,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
     return scale;
 }
 
-static float make_q3_quants(int n, int nmax, const float *
+static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
     float max = 0;
     float amax = 0;
     for (int i = 0; i < n; ++i) {
@@ -1477,7 +1556,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
-static float make_qkx1_quants(int n, int nmax, const float *
+static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
        int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
@@ -1520,8 +1599,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
     return scale;
 }
 
-static float make_qkx2_quants(int n, int nmax, const float *
-        uint8_t *
+static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
+        uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
        float rmin, float rdelta, int nstep, bool use_mad) {
     float min = x[0];
     float max = x[0];
@@ -1601,7 +1680,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
     return scale;
 }
 
-static inline void get_scale_min_k4(int j, const uint8_t *
+static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
     if (j < 4) {
         *d = q[j] & 63; *m = q[j + 4] & 63;
     } else {
@@ -1612,51 +1691,51 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
 
 //========================- 2-bit (de)-quantization
 
-void quantize_row_q2_K(const float *
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     quantize_row_q2_K_ref(x, vy, k);
 }
 
 //========================= 3-bit (de)-quantization
 
-void quantize_row_q3_K(const float *
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     quantize_row_q3_K_ref(x, vy, k);
 }
 
 // ====================== 4-bit (de)-quantization
 
-void quantize_row_q4_K(const float *
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_q4_K *
+    block_q4_K * GGML_RESTRICT y = vy;
     quantize_row_q4_K_ref(x, y, k);
 }
 
 // ====================== 5-bit (de)-quantization
 
-void quantize_row_q5_K(const float *
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_q5_K *
+    block_q5_K * GGML_RESTRICT y = vy;
     quantize_row_q5_K_ref(x, y, k);
 }
 
 // ====================== 6-bit (de)-quantization
 
-void quantize_row_q6_K(const float *
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_q6_K *
+    block_q6_K * GGML_RESTRICT y = vy;
     quantize_row_q6_K_ref(x, y, k);
 }
 
 // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
 
-void quantize_row_tq1_0(const float *
+void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_tq1_0 *
+    block_tq1_0 * GGML_RESTRICT y = vy;
     quantize_row_tq1_0_ref(x, y, k);
 }
 
-void quantize_row_tq2_0(const float *
+void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(k % QK_K == 0);
-    block_tq2_0 *
+    block_tq2_0 * GGML_RESTRICT y = vy;
     quantize_row_tq2_0_ref(x, y, k);
 }
 
@@ -1664,8 +1743,88 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1
 
 //===================================== Q8_K ==============================================
 
-void quantize_row_q8_K(const float *
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+#ifdef __wasm_simd128__
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+    block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type
+
+    for (int i = 0; i < nb; i++) {
+        const float * x_block = x + i * QK_K;
+
+        v128_t min_vec = wasm_v128_load(x_block);
+        v128_t max_vec = min_vec;
+
+        for (int j = 4; j < QK_K; j += 4) {
+            v128_t x_vec = wasm_v128_load(x_block + j);
+            max_vec = wasm_f32x4_pmax(max_vec, x_vec);
+            min_vec = wasm_f32x4_pmin(min_vec, x_vec);
+        }
+        max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1));
+        max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2));
+        min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1));
+        min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2));
+        float max = wasm_f32x4_extract_lane(max_vec, 0);
+        float min = wasm_f32x4_extract_lane(min_vec, 0);
+        float amax = -min > max ? min : max;
+
+        if (amax == 0.0f) {
+            yc[i].d = 0.0f;
+            const v128_t zero = wasm_i8x16_splat(0);
+            for (int j = 0; j < QK_K; j += 16) {
+                wasm_v128_store(yc[i].qs + j, zero);
+            }
+            continue;
+        }
+
+        const float iscale = -127.0f / amax;
+        const v128_t scale_vec = wasm_f32x4_splat(iscale);
+
+        // Process 16 elements per iteration
+        for (int j = 0, jb = 0; j < QK_K; j += 16, jb++) {
+            // Load and quantize 16 floats
+            v128_t x0 = wasm_v128_load(x_block + j);
+            v128_t x1 = wasm_v128_load(x_block + j + 4);
+            v128_t x2 = wasm_v128_load(x_block + j + 8);
+            v128_t x3 = wasm_v128_load(x_block + j + 12);
+
+            v128_t q0 = wasm_f32x4_nearest(wasm_f32x4_mul(x0, scale_vec));
+            v128_t q1 = wasm_f32x4_nearest(wasm_f32x4_mul(x1, scale_vec));
+            v128_t q2 = wasm_f32x4_nearest(wasm_f32x4_mul(x2, scale_vec));
+            v128_t q3 = wasm_f32x4_nearest(wasm_f32x4_mul(x3, scale_vec));
+
+            // Convert to i32 with saturation
+            v128_t i0 = wasm_i32x4_trunc_sat_f32x4(q0);
+            v128_t i1 = wasm_i32x4_trunc_sat_f32x4(q1);
+            v128_t i2 = wasm_i32x4_trunc_sat_f32x4(q2);
+            v128_t i3 = wasm_i32x4_trunc_sat_f32x4(q3);
+
+            // Pack into 16 i8 values
+            v128_t i8 = wasm_i8x16_narrow_i16x8(
+                wasm_i16x8_narrow_i32x4(i0, i1),
+                wasm_i16x8_narrow_i32x4(i2, i3)
+            );
+            wasm_v128_store(yc[i].qs + j, i8);
+
+            // Calculate bsums using SIMD
+            v128_t sum16 = wasm_i16x8_add(
+                wasm_i16x8_extend_low_i8x16(i8),
+                wasm_i16x8_extend_high_i8x16(i8)
+            );
+            v128_t sum32 = wasm_i32x4_add(
+                wasm_i32x4_extend_low_i16x8(sum16),
+                wasm_i32x4_extend_high_i16x8(sum16)
+            );
+            sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 2, 3, 0, 1));
+            sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 1, 0, 3, 2));
+            yc[i].bsums[jb] = wasm_i32x4_extract_lane(sum32, 0);
+        }
+
+        yc[i].d = 1.0f / iscale;
+    }
+#else
     quantize_row_q8_K_ref(x, y, k);
+#endif
 }
 
 //===================================== Dot products =================================
@@ -1750,7 +1909,7 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif
 
-void ggml_vec_dot_q4_0_q8_0(int n, float *
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -1765,23 +1924,23 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q4_0 *
-    const block_q8_0 *
+    const block_q4_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
-        const block_q4_0 *
-        const block_q4_0 *
-        const block_q8_0 *
-        const block_q8_0 *
+        const block_q4_0 * GGML_RESTRICT vx0 = vx;
+        const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
+        const block_q8_0 * GGML_RESTRICT vy0 = vy;
+        const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
 
         float32x4_t sumv0 = vdupq_n_f32(0.0f);
 
         for (int i = 0; i < nb; i++) {
-            const block_q4_0 *
-            const block_q4_0 *
-            const block_q8_0 *
-            const block_q8_0 *
+            const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i];
+            const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i];
+            const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
+            const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
 
             const uint8x16_t m4b = vdupq_n_u8(0x0F);
             const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -1858,10 +2017,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
 
         for (; ib + 1 < nb; ib += 2) {
-            const block_q4_0 *
-            const block_q4_0 *
-            const block_q8_0 *
-            const block_q8_0 *
+            const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             // load x
             const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -1904,10 +2063,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
 
         for (; ib + 1 < nb; ib += 2) {
-            const block_q4_0 *
-            const block_q4_0 *
-            const block_q8_0 *
-            const block_q8_0 *
+            const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             // load x
             const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -1945,10 +2104,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const svbool_t pl16 = svnot_b_z(ph32, ph16);
 
         for (; ib + 1 < nb; ib += 2) {
-            const block_q4_0 *
-            const block_q4_0 *
-            const block_q8_0 *
-            const block_q8_0 *
+            const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             // load x
             const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
@@ -1985,10 +2144,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
     for (; ib + 1 < nb; ib += 2) {
-        const block_q4_0 *
-        const block_q4_0 *
-        const block_q8_0 *
-        const block_q8_0 *
+        const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0x0F);
         const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -2023,6 +2182,94 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined __wasm_simd128__
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    const v128_t m4b = wasm_i8x16_splat(0x0F);
+    const v128_t s8b = wasm_i8x16_splat(0x8);
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q4_0 * GGML_RESTRICT x0 = &x[ib];
+        const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
+
+        // Load and process x0
+        v128_t v0_0 = wasm_v128_load(x0->qs);
+        v128_t v0_0l = wasm_v128_and(v0_0, m4b);
+        v128_t v0_0h = wasm_u8x16_shr(v0_0, 4);
+        v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b);
+        v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b);
+
+        // Load y0 vectors
+        v128_t y0_l = wasm_v128_load(y0->qs);
+        v128_t y0_h = wasm_v128_load(y0->qs + 16);
+
+        // Extend to i16x8 and compute dot products
+        v128_t dx0l = wasm_i16x8_extend_low_i8x16(v0_0ls);
+        v128_t dx0h = wasm_i16x8_extend_high_i8x16(v0_0ls);
+        v128_t dx0hl = wasm_i16x8_extend_low_i8x16(v0_0hs);
+        v128_t dx0hh = wasm_i16x8_extend_high_i8x16(v0_0hs);
+
+        v128_t dy0ll = wasm_i16x8_extend_low_i8x16(y0_l);
+        v128_t dy0lh = wasm_i16x8_extend_high_i8x16(y0_l);
+        v128_t dy0hl = wasm_i16x8_extend_low_i8x16(y0_h);
+        v128_t dy0hh = wasm_i16x8_extend_high_i8x16(y0_h);
+
+        v128_t dp0 = wasm_i32x4_add(
+            wasm_i32x4_add(
+                wasm_i32x4_dot_i16x8(dx0l, dy0ll),
+                wasm_i32x4_dot_i16x8(dx0h, dy0lh)
+            ),
+            wasm_i32x4_add(
+                wasm_i32x4_dot_i16x8(dx0hl, dy0hl),
+                wasm_i32x4_dot_i16x8(dx0hh, dy0hh)
+            )
+        );
+
+        // Load and process x1
+        v128_t v0_1 = wasm_v128_load(x1->qs);
+        v128_t v0_1l = wasm_v128_and(v0_1, m4b);
+        v128_t v0_1h = wasm_u8x16_shr(v0_1, 4);
+        v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b);
+        v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b);
+
+        // Load y1 vectors
+        v128_t y1_l = wasm_v128_load(y1->qs);
+        v128_t y1_h = wasm_v128_load(y1->qs + 16);
+
+        // Extend to i16x8 and compute dot products
+        v128_t dx1l = wasm_i16x8_extend_low_i8x16(v0_1ls);
+        v128_t dx1h = wasm_i16x8_extend_high_i8x16(v0_1ls);
+        v128_t dx1hl = wasm_i16x8_extend_low_i8x16(v0_1hs);
+        v128_t dx1hh = wasm_i16x8_extend_high_i8x16(v0_1hs);
+
+        v128_t dy1ll = wasm_i16x8_extend_low_i8x16(y1_l);
+        v128_t dy1lh = wasm_i16x8_extend_high_i8x16(y1_l);
+        v128_t dy1hl = wasm_i16x8_extend_low_i8x16(y1_h);
+        v128_t dy1hh = wasm_i16x8_extend_high_i8x16(y1_h);
+
+        v128_t dp1 = wasm_i32x4_add(
+            wasm_i32x4_add(
+                wasm_i32x4_dot_i16x8(dx1l, dy1ll),
+                wasm_i32x4_dot_i16x8(dx1h, dy1lh)
+            ),
+            wasm_i32x4_add(
+                wasm_i32x4_dot_i16x8(dx1hl, dy1hl),
+                wasm_i32x4_dot_i16x8(dx1hh, dy1hh)
+            )
+        );
+
+        // Accumulate results with scaling
+        float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
+        float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d);
+
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0)));
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1)));
+    }
+
+    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -2311,6 +2558,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#elif defined(__VXE__) || defined(__VXE2__)
+    __vector float acc = vec_splats(0.0f);
+
+    const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
+    const __vector int8_t  v_s = vec_splats( (const int8_t)0x08);
+
+    for (; ib < nb; ++ib) {
+        const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
+        const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
+        const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+
+        const __vector int8_t v_xls = vec_sub(v_xl, v_s);
+        const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+
+        const __vector int8_t v_yl = vec_xl(0      , y[ib].qs);
+        const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+
+        const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
+        const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
+        const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
+        const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+
+        __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+
+        const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
+        const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        acc = vec_madd(v_xy, v_d, acc);
+    }
+
+    sumf = acc[0] + acc[1] + acc[2] + acc[3];
 #endif
     for (; ib < nb; ++ib) {
         int sumi0 = 0;
@@ -2331,7 +2609,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
-void ggml_vec_dot_q4_1_q8_1(int n, float *
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
@@ -2346,24 +2624,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q4_1 *
-    const block_q8_1 *
+    const block_q4_1 * GGML_RESTRICT x = vx;
+    const block_q8_1 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
-        const block_q4_1 *
-        const block_q4_1 *
-        const block_q8_1 *
-        const block_q8_1 *
+        const block_q4_1 * GGML_RESTRICT vx0 = vx;
+        const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
+        const block_q8_1 * GGML_RESTRICT vy0 = vy;
+        const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
 
         float32x4_t sumv0 = vdupq_n_f32(0.0f);
         float32x4_t summs0 = vdupq_n_f32(0.0f);
 
         for (int i = 0; i < nb; i++) {
-            const block_q4_1 *
-            const block_q4_1 *
-            const block_q8_1 *
-            const block_q8_1 *
+            const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i];
+            const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i];
+            const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i];
+            const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];
 
             float32_t summs_t[4] = {
                 GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
@@ -2437,10 +2715,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     float summs = 0;
 
     for (; ib + 1 < nb; ib += 2) {
-        const block_q4_1 *
-        const block_q4_1 *
-        const block_q8_1 *
-        const block_q8_1 *
+        const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
 
         summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
 
@@ -2604,6 +2882,35 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_8(acc) + summs;
+#elif defined(__VXE__) || defined(__VXE2__)
+    float summs = 0;
+    float32x4_t acc = vec_splats(0.0f);
+
+    const uint8x16_t v_m = vec_splat_u8(0x0F);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+        const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+        const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
+
+        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
+        const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs);
+
+        const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+        const float32x4_t v_xy = vec_float(v_xy_);
+
+        const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        acc = vec_madd(v_xy, v_d, acc);
+    }
+
+    sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
 #endif
     for (; ib < nb; ++ib) {
         int sumi0 = 0;
@@ -2624,7 +2931,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
-void ggml_vec_dot_q5_0_q8_0(int n, float *
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -2639,8 +2946,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q5_0 *
-    const block_q8_0 *
+    const block_q5_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -2653,10 +2960,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     uint64_t tmp1[4];
 
     for (; ib + 1 < nb; ib += 2) {
-        const block_q5_0 *
-        const block_q5_0 *
-        const block_q8_0 *
-        const block_q8_0 *
+        const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+        const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0x0F);
 
@@ -2709,26 +3016,26 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined
+#elif defined __wasm_simd128__
     v128_t sumv = wasm_f32x4_splat(0.0f);
 
-    uint32_t
+    uint32_t qh_;
     uint64_t tmp[4];
 
     // TODO: check if unrolling this is better
     for (; ib < nb; ++ib) {
-        const block_q5_0 *
-        const block_q8_0 *
+        const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
 
         const v128_t m4b = wasm_i8x16_splat(0x0F);
 
         // extract the 5th bit
-        memcpy(&
+        memcpy(&qh_, x0->qh, sizeof(qh_));
 
-        tmp[0] = table_b2b_1[(
-        tmp[1] = table_b2b_1[(
-        tmp[2] = table_b2b_1[(
-        tmp[3] = table_b2b_1[(
+        tmp[0] = table_b2b_1[(qh_ >> 0) & 0xFF];
+        tmp[1] = table_b2b_1[(qh_ >> 8) & 0xFF];
+        tmp[2] = table_b2b_1[(qh_ >> 16) & 0xFF];
+        tmp[3] = table_b2b_1[(qh_ >> 24) ];
 
         const v128_t qhl = wasm_v128_load(tmp + 0);
         const v128_t qhh = wasm_v128_load(tmp + 2);
@@ -2979,7 +3286,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
-void ggml_vec_dot_q5_1_q8_1(int n, float *
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
@@ -2994,8 +3301,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q5_1 *
-    const block_q8_1 *
+    const block_q5_1 * GGML_RESTRICT x = vx;
+    const block_q8_1 * GGML_RESTRICT y = vy;
 
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -3011,10 +3318,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     uint64_t tmp1[4];
 
     for (; ib + 1 < nb; ib += 2) {
-        const block_q5_1 *
-        const block_q5_1 *
-        const block_q8_1 *
-        const block_q8_1 *
+        const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+        const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
+        const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0x0F);
 
@@ -3070,30 +3377,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
-#elif defined
+#elif defined __wasm_simd128__
    v128_t sumv = wasm_f32x4_splat(0.0f);
 
    float summs = 0.0f;
 
-    uint32_t
+    uint32_t qh_;
    uint64_t tmp[4];
 
    // TODO: check if unrolling this is better
    for (; ib < nb; ++ib) {
-        const block_q5_1 *
-        const block_q8_1 *
+        const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
 
        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
 
        const v128_t m4b = wasm_i8x16_splat(0x0F);
 
        // extract the 5th bit
-        memcpy(&
+        memcpy(&qh_, x0->qh, sizeof(qh_));
 
-        tmp[0] = table_b2b_0[(
-        tmp[1] = table_b2b_0[(
-        tmp[2] = table_b2b_0[(
-        tmp[3] = table_b2b_0[(
+        tmp[0] = table_b2b_0[(qh_ >> 0) & 0xFF];
+        tmp[1] = table_b2b_0[(qh_ >> 8) & 0xFF];
+        tmp[2] = table_b2b_0[(qh_ >> 16) & 0xFF];
+        tmp[3] = table_b2b_0[(qh_ >> 24) ];
 
        const v128_t qhl = wasm_v128_load(tmp + 0);
        const v128_t qhh = wasm_v128_load(tmp + 2);
@@ -3353,7 +3660,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
|
3353
3660
|
*s = sumf;
|
|
3354
3661
|
}
|
|
3355
3662
|
|
|
3356
|
-
void ggml_vec_dot_q8_0_q8_0(int n, float *
|
|
3663
|
+
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
3357
3664
|
const int qk = QK8_0;
|
|
3358
3665
|
const int nb = n / qk;
|
|
3359
3666
|
|
|
@@ -3368,24 +3675,24 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
|
3368
3675
|
UNUSED(by);
|
|
3369
3676
|
UNUSED(bs);
|
|
3370
3677
|
|
|
3371
|
-
const block_q8_0 *
|
|
3372
|
-
const block_q8_0 *
|
|
3678
|
+
const block_q8_0 * GGML_RESTRICT x = vx;
|
|
3679
|
+
const block_q8_0 * GGML_RESTRICT y = vy;
|
|
3373
3680
|
|
|
3374
3681
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
|
3375
3682
|
if (nrc == 2) {
|
|
3376
|
-
const block_q8_0 *
|
|
3377
|
-
const block_q8_0 *
|
|
3378
|
-
const block_q8_0 *
|
|
3379
|
-
const block_q8_0 *
|
|
3683
|
+
const block_q8_0 * GGML_RESTRICT vx0 = vx;
|
|
3684
|
+
const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
|
|
3685
|
+
const block_q8_0 * GGML_RESTRICT vy0 = vy;
|
|
3686
|
+
const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
|
|
3380
3687
|
|
|
3381
3688
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
|
3382
3689
|
|
|
3383
3690
|
for (int i = 0; i < nb; i++) {
|
|
3384
|
-
const block_q8_0 *
|
|
3385
|
-
const block_q8_0 *
|
|
3691
|
+
const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i];
|
|
3692
|
+
const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
|
|
3386
3693
|
|
|
3387
|
-
const block_q8_0 *
|
|
3388
|
-
const block_q8_0 *
|
|
3694
|
+
const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i];
|
|
3695
|
+
const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
|
|
3389
3696
|
|
|
3390
3697
|
const int8x16_t x0_l = vld1q_s8(b_x0->qs);
|
|
3391
3698
|
const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
|
|
@@ -3450,10 +3757,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
 
     for (; ib + 1 < nb; ib += 2) {
-        const block_q8_0 *
-        const block_q8_0 *
-        const block_q8_0 *
-        const block_q8_0 *
+        const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
         // load x
         const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
@@ -3481,10 +3788,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     {
         //printf("sve256");
         for (; ib + 1 < nb; ib += 2) {
-            const block_q8_0 *
-            const block_q8_0 *
-            const block_q8_0 *
-            const block_q8_0 *
+            const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+            const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+            const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+            const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
             // load x
             const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
@@ -3517,10 +3824,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     svfloat32_t sumv00 = svdup_n_f32(0.0f);
 
     for (; ib + 1 < nb; ib += 2) {
-        const block_q8_0 *
-        const block_q8_0 *
-        const block_q8_0 *
-        const block_q8_0 *
+        const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
         //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
         // and add them to make one 64 element vector
@@ -3560,10 +3867,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
     for (; ib + 1 < nb; ib += 2) {
-        const block_q8_0 *
-        const block_q8_0 *
-        const block_q8_0 *
-        const block_q8_0 *
+        const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
         const int8x16_t x0_0 = vld1q_s8(x0->qs);
         const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
@@ -3586,6 +3893,45 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined __wasm_simd128__
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    for (; ib < nb; ++ib) {
+        const block_q8_0 * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+
+        const v128_t x0_0 = wasm_v128_load(x0->qs);
+        const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
+        const v128_t y0_0 = wasm_v128_load(y0->qs);
+        const v128_t y0_1 = wasm_v128_load(y0->qs + 16);
+
+        // Extend 8-bit to 16-bit
+        const v128_t x0_0l = wasm_i16x8_extend_low_i8x16(x0_0);
+        const v128_t x0_0h = wasm_i16x8_extend_high_i8x16(x0_0);
+        const v128_t x0_1l = wasm_i16x8_extend_low_i8x16(x0_1);
+        const v128_t x0_1h = wasm_i16x8_extend_high_i8x16(x0_1);
+
+        const v128_t y0_0l = wasm_i16x8_extend_low_i8x16(y0_0);
+        const v128_t y0_0h = wasm_i16x8_extend_high_i8x16(y0_0);
+        const v128_t y0_1l = wasm_i16x8_extend_low_i8x16(y0_1);
+        const v128_t y0_1h = wasm_i16x8_extend_high_i8x16(y0_1);
+
+        // Compute dot products
+        const v128_t dx0_0 = wasm_i32x4_dot_i16x8(x0_0l, y0_0l);
+        const v128_t dx0_1 = wasm_i32x4_dot_i16x8(x0_0h, y0_0h);
+        const v128_t dx1_0 = wasm_i32x4_dot_i16x8(x0_1l, y0_1l);
+        const v128_t dx1_1 = wasm_i32x4_dot_i16x8(x0_1h, y0_1h);
+
+        // Sum all dot products
+        const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1));
+
+        // Convert to float and accumulate
+        const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale)));
+    }
+
+    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -3699,6 +4045,27 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_8(acc);
+#elif defined(__VXE__) || defined(__VXE2__)
+    __vector float acc = vec_splats(0.0f);
+
+#pragma GCC unroll 8
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        const int8x16_t v_xl = vec_xl(0      , x[ib].qs);
+        const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs);
+        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+
+        const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+        const float32x4_t v_xy = vec_float(v_xy_);
+        const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        acc = vec_madd(v_xy, v_d, acc);
+    }
+
+    sumf = acc[0] + acc[1] + acc[2] + acc[3];
 #endif
     for (; ib < nb; ++ib) {
         int sumi = 0;
@@ -3713,15 +4080,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
-void ggml_vec_dot_tq1_0_q8_K(int n, float *
+void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
     UNUSED(by);
     UNUSED(bs);
 
-    const block_tq1_0 *
-    const block_q8_K *
+    const block_tq1_0 * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -4036,15 +4403,15 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
 #endif
 }
 
-void ggml_vec_dot_tq2_0_q8_K(int n, float *
+void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
    UNUSED(bx);
     UNUSED(by);
     UNUSED(bs);
 
-    const block_tq2_0 *
-    const block_q8_K *
+    const block_tq2_0 * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
@@ -4208,19 +4575,264 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
 #endif
 }
 
-void ggml_vec_dot_q2_K_q8_K(int n, float *
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
     UNUSED(by);
     UNUSED(bs);
 
-    const block_q2_K *
-    const block_q8_K *
+    const block_q2_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
-#ifdef
+#ifdef __ARM_FEATURE_SVE
+    const int vector_length = svcntb()*8;
+    const svuint8_t m3s = svdup_n_u8(0x3);
+    const svuint32_t m4s = svdup_n_u32(0xF);
+    const svint32_t vzero_sv = svdup_n_s32(0);
+    svfloat32_t acc_sum = svdup_n_f32(0);
+    svbool_t pred_s32 = svptrue_pat_b32(SV_VL4);
+
+    switch (vector_length) {
+        case 128:
+            for (int i = 0; i < nb; ++i) {
+                const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+                svfloat32_t d_broad = svdup_n_f32((float32_t)d);
+                const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+                svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
+
+                const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+                const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+                const uint8_t * GGML_RESTRICT sc = x[i].scales;
+
+                svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc);
+                const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+                mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+4);
+                const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+                svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums);
+                svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+4);
+
+                const svint32_t s0 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_2, q8sums_sv_2));
+
+                mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+8);
+                const svint32_t mins_sv_3 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+                mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+12);
+                const svint32_t mins_sv_4 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+                q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums+8);
+                q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+12);
+
+                svint32_t s1 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_3, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_4, q8sums_sv_2));
+
+                svfloat32_t temp = svcvt_f32_s32_x(svptrue_b32(), svadd_s32_x(svptrue_b32(), s0, s1));
+
+                acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, temp, dmin_broad);
+
+                svint32_t sumi1 = svdup_n_s32(0);
+
+                {
+                    const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2);
+                    svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s));
+                    svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                    const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc), m4s));
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 0));
+
+                    const svuint8_t q2bits_3 = svld1_u8(svptrue_b8(), q2+16);
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_3, m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 1));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 2));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 3));
+
+
+                    const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+4), m4s));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 0));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 1));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 2));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 3));
+
+                    //-------------------------------
+
+                    q2 += 32;
+                    const svint32_t scales_sv_2 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+8), m4s));
+                    const svuint8_t q2bits_2 = svld1_u8(svptrue_b8(), q2);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_2, m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 0));
+
+                    const svuint8_t q2bits_4 = svld1_u8(svptrue_b8(), q2+16);
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_4, m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 1));
+
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 2));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 3));
+
+
+                    const svint32_t scales_sv_3 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+12), m4s));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 0));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 1));
+
+
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 2));
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 3));
+                }
+                acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, svcvt_f32_s32_x(svptrue_b32(), sumi1), d_broad);
+            }
+            *s = svaddv_f32(svptrue_b32(), acc_sum);
+            break;
+
+        case 256:
+        case 512:
+            for (int i = 0; i < nb; ++i) {
+                const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+                svfloat32_t d_broad = svdup_n_f32((float32_t)d);
+                const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+                svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
+
+                const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+                const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+                const uint8_t * GGML_RESTRICT sc = x[i].scales;
+
+                const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8;
+                const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s));
+                const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, 4));
+                svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums);
+
+                const svuint32_t mins_and_scales_sve_1 = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc);
+                const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, m4s));
+                const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, 4));
+
+                svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums+8);
+
+                svfloat32_t temp = svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_2, q8sums_sv_2)));
+
+                acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad);
+
+                svint32_t sumi1 = svdup_n_s32(0);
+
+                {
+                    const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
+                    svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s));
+                    svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    svint32_t scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 0), svdup_lane_s32(scales_sv, 1));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    svint32_t scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 2), svdup_lane_s32(scales_sv, 3));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(svdup_n_s32(0), q2bytes_sv, q8bytes_sv), scale_2);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 4), svdup_lane_s32(scales_sv, 5));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 6), svdup_lane_s32(scales_sv, 7));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
+
+                    q2 += 32;
+
+                    const svuint8_t q2bits_2 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_2, m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 0), svdup_lane_s32(scales_sv_1, 1));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 2), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 2), svdup_lane_s32(scales_sv_1, 3));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 4), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 4), svdup_lane_s32(scales_sv_1, 5));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+                    q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 6), m3s));
+                    q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7));
+                    sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
+                }
+                acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad);
+            }
+            *s = svaddv_f32(svptrue_pat_b32(SV_VL8), acc_sum);
+            break;
+
+        default:
+            assert(false && "Unsupported vector length");
+            break;
+    }
+
+#elif __ARM_NEON
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
 
@@ -4235,9 +4847,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t *
-        const int8_t *
-        const uint8_t *
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
+        const uint8_t * GGML_RESTRICT sc = x[i].scales;
 
         const uint8x16_t mins_and_scales = vld1q_u8(sc);
         const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
@@ -4300,8 +4912,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
         const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
@@ -4367,8 +4979,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // load mins and scales from block_q2_K.scales[QK_K/16]
         const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
@@ -4460,6 +5072,106 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = hsum_float_8(acc);
 
+#elif defined __wasm_simd128__
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * q2 = x[i].qs;
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        // Vectorized summs calculation
+        v128_t summs_vec = wasm_i32x4_splat(0);
+        {
+            v128_t sc_vec = wasm_v128_load(sc);
+            v128_t sc_upper = wasm_u8x16_shr(sc_vec, 4);
+
+            v128_t sc_low = wasm_u16x8_extend_low_u8x16(sc_upper);
+            v128_t sc_high = wasm_u16x8_extend_high_u8x16(sc_upper);
+
+            v128_t bsums1 = wasm_v128_load(&y[i].bsums[0]);
+            v128_t bsums2 = wasm_v128_load(&y[i].bsums[8]);
+
+            summs_vec = wasm_i32x4_add(
+                wasm_i32x4_add(wasm_i32x4_dot_i16x8(sc_low, bsums1),
+                               wasm_i32x4_dot_i16x8(sc_high, bsums2)),
+                summs_vec
+            );
+
+            summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 2, 3, 0, 1));
+            summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 1, 0, 3, 2));
+        }
+        int32_t summs = wasm_i32x4_extract_lane(summs_vec, 0);
+
+        // Vectorized isum calculation
+        int32_t isum = 0;
+        const uint8_t * sc_ptr = sc;
+        const int k_iters = QK_K/128;
+
+        for (int k = 0; k < k_iters; ++k) {
+            v128_t isum_vec = wasm_i32x4_splat(0);
+            int shift = 0;
+
+            for (int j = 0; j < 4; ++j) {
+                const int d0 = (sc_ptr[0] & 0xF);
+                const int d1 = (sc_ptr[1] & 0xF);
+                sc_ptr += 2;
+
+                // Process first 16 elements
+                v128_t q2_0 = wasm_v128_load(q2);
+                v128_t q8_0 = wasm_v128_load(q8);
+                v128_t q2_shift_0 = wasm_u8x16_shr(q2_0, shift);
+                v128_t q2_bits_0 = wasm_v128_and(q2_shift_0, wasm_i8x16_splat(0x03));
+
+                // Process next 16 elements
+                v128_t q2_1 = wasm_v128_load(q2 + 16);
+                v128_t q8_1 = wasm_v128_load(q8 + 16);
+                v128_t q2_shift_1 = wasm_u8x16_shr(q2_1, shift);
+                v128_t q2_bits_1 = wasm_v128_and(q2_shift_1, wasm_i8x16_splat(0x03));
+
+                // Calculate dot products
+                v128_t p0 = wasm_i32x4_dot_i16x8(
+                    wasm_i16x8_extend_low_i8x16(q8_0),
+                    wasm_i16x8_extend_low_i8x16(q2_bits_0)
+                );
+                v128_t p1 = wasm_i32x4_dot_i16x8(
+                    wasm_i16x8_extend_high_i8x16(q8_0),
+                    wasm_i16x8_extend_high_i8x16(q2_bits_0)
+                );
+                v128_t p2 = wasm_i32x4_dot_i16x8(
+                    wasm_i16x8_extend_low_i8x16(q8_1),
+                    wasm_i16x8_extend_low_i8x16(q2_bits_1)
+                );
+                v128_t p3 = wasm_i32x4_dot_i16x8(
+                    wasm_i16x8_extend_high_i8x16(q8_1),
+                    wasm_i16x8_extend_high_i8x16(q2_bits_1)
+                );
+
+                // Accumulate scaled results
+                v128_t scaled = wasm_i32x4_add(
+                    wasm_i32x4_mul(wasm_i32x4_add(p0, p1), wasm_i32x4_splat(d0)),
+                    wasm_i32x4_mul(wasm_i32x4_add(p2, p3), wasm_i32x4_splat(d1))
+                );
+
+                isum_vec = wasm_i32x4_add(isum_vec, scaled);
+                q8 += 32;
+                shift += 2;
+            }
+            q2 += 32;
+
+            // Horizontal sum of isum_vec
+            isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 2, 3, 0, 1));
+            isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 1, 0, 3, 2));
+            isum += wasm_i32x4_extract_lane(isum_vec, 0);
+        }
+
+        const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        sumf += dall * isum - dmin * summs;
+    }
+
+    *s = sumf;
+
 #elif defined __riscv_v_intrinsic
 
     float sumf = 0;
@@ -4594,8 +5306,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         vector signed int vsumi6 = v0;
         vector signed int vsumi7 = v0;
 
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/128; ++j) {
             __builtin_prefetch(q2, 0, 1);
@@ -4679,9 +5391,6 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
 #elif defined __loongarch_asx
 
-    const __m256i m3 = __lasx_xvreplgr2vr_b(3);
-    const __m128i m4 = __lsx_vreplgr2vr_b(0xF);
-
     __m256 acc = (__m256)__lasx_xvldi(0);
 
     for (int i = 0; i < nb; ++i) {
@@ -4689,21 +5398,18 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
-        const __m128i
-        const __m128i
-        const
-        const __m256i mins = lasx_ext8_16(mins8);
+        const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
+        const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf);
+        const __m256i mins = lasx_ext8_16(__lsx_vsrli_b(mins_and_scales128, 4));
         const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0));
 
         acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc);
 
-        const
-        const
-        const __m128i h_scales = lasx_extracti128(all_scales, 1);
-        const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
+        const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
+        const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask));
 
         __m256i sumi = __lasx_xvldi(0);
 
@@ -4716,20 +5422,20 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
             const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
 
-            const __m256i q2_0 =
-            const __m256i q2_1 =
-            const __m256i q2_2 =
-            const __m256i q2_3 =
+            const __m256i q2_0 = __lasx_xvandi_b(q2bits, 3);
+            const __m256i q2_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 2), 3);
+            const __m256i q2_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 4), 3);
+            const __m256i q2_3 = __lasx_xvsrli_b(q2bits, 6);
 
-            __m256i p0 =
-            __m256i p1 =
-            __m256i p2 =
-            __m256i p3 =
+            __m256i p0 = lasx_madd_h_b(q2_0, q8_0);
+            __m256i p1 = lasx_madd_h_b(q2_1, q8_1);
+            __m256i p2 = lasx_madd_h_b(q2_2, q8_2);
+            __m256i p3 = lasx_madd_h_b(q2_3, q8_3);
 
-            p0 = lasx_madd_h(
-            p1 = lasx_madd_h(
-            p2 = lasx_madd_h(
-            p3 = lasx_madd_h(
+            p0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p0);
+            p1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p1);
+            p2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p2);
+            p3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p3);
 
             p0 = __lasx_xvadd_w(p0, p1);
             p2 = __lasx_xvadd_w(p2, p3);
@@ -4786,7 +5492,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 #endif
 }
 
-void ggml_vec_dot_q3_K_q8_K(int n, float *
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -4797,12 +5503,187 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     const uint32_t kmask1 = 0x03030303;
     const uint32_t kmask2 = 0x0f0f0f0f;
 
-    const block_q3_K *
-    const block_q8_K *
+    const block_q3_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
 
     const int nb = n / QK_K;
 
-#
+#if defined(__ARM_FEATURE_SVE)
+
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    const int8_t m32 = 32;
+    const int vector_length = svcntb()*8;
+    const svuint8_t m3b_sv = svdup_n_u8(0x3);
+    const svint32_t vzero_sv = svdup_n_s32(0);
+
+    const svuint8_t m0_sv = svdup_n_u8(1);
+    const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1);
+    const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2);
+    const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3);
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
+        const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+
+        // Set up scales
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+
+        for (int j = 0; j < 16; ++j) scale[j] -= m32;
+
+        switch (vector_length) {
+            case 128:
+            {
+                svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv);
+                svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16);
+                svuint8_t q3h_sv;
+
+                svint32_t sumi1_1 = svdup_n_s32(0);
+                svint8_t q3bytes_sv;
+
+                for (int j = 0; j < QK_K/128; ++j) {
+
+                    const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
+                    const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
+                    svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                    svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2);
+                    q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
+
+                    q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2);
+                    q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
+
+                    q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                    q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1);
+                    q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
+
+                    q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1);
+                    q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
+
+
+                    scale += 4;
+                    q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                    q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1);
+                    q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
+
+                    q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2);
+                    q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
+
+
+                    q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+                    q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+                    q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1);
+                    q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
+
+                    q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1);
+                    q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
+
+                    if (j == 0) {
+                        qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4);
+                        qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4);
+                    }
+
+                    scale += 4;
+                }
+
+                sum += d * (svaddv_s32(svptrue_b32(), sumi1_1));
+            } break;
+            case 256:
+            case 512:
+            {
+                svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv);
+                svuint8_t q3h_sv;
+
+                svint32_t sumi1_1 = svdup_n_s32(0);
+                svint8_t q3bytes_sv;
+
+                for (int j = 0; j < QK_K/128; ++j) {
+
+                    const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32;
+                    svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+                    svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2);
+                    q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+
+                    svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
+                    sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
+
+                    q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1);
+                    q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
+                    sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
+
+                    scale += 4;
+                    q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+                    q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+                    q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv);
+                    q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
+                    sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
+
+                    q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1);
+                    q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
+
+                    scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
+                    sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
+
+                    if (j == 0) {
+                        qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4);
+                    }
+
+                    scale += 4;
+                }
+
+                sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1));
+            } break;
+            default:
+                assert(false && "Unsupported vector length");
+                break;
+        }
+    }
+    *s = sum;
+
+#elif __ARM_NEON
 
     uint32_t aux[3];
     uint32_t utmp[4];
@@ -4824,9 +5705,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].hmask;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
@@ -4910,8 +5791,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // Set up scales
         memcpy(aux, x[i].scales, 12);
@@ -5015,8 +5896,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         // Set up scales
         aux = (const uint32_t *)x[i].scales;
@@ -5142,6 +6023,94 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = hsum_float_8(acc);
 
+#elif defined __wasm_simd128__
+    int8_t aux8[QK_K];
+    float sums[8] = {0};
+    uint32_t auxs[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
+
+        // Process blocks with SIMD
+        int8_t * a = aux8;
+        uint8_t m = 1;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int shift = 0; shift <= 6; shift += 2) {
+                v128_t v_m = wasm_i8x16_splat(m);
+                for (int l = 0; l < 32; l += 16) {
+                    v128_t v_q3 = wasm_v128_load(q3 + l);
+                    v128_t v_shift = wasm_i8x16_shr(v_q3, shift);
+                    v128_t v_low2 = wasm_v128_and(v_shift, wasm_i8x16_splat(0x03));
+
+                    v128_t v_hm = wasm_v128_load(hm + l);
+                    v128_t v_mask = wasm_v128_and(v_hm, v_m);
+                    v_mask = wasm_i8x16_ne(v_mask, wasm_i8x16_splat(0));
+
+                    v_low2 = wasm_i8x16_sub(v_low2, wasm_v128_and(wasm_i8x16_splat(4), wasm_v128_not(v_mask)));
+                    wasm_v128_store(a + l, v_low2);
+                }
+                a += 32;
+                m <<= 1;
+            }
+            q3 += 32;
+        }
+
+        // Extract scales
+        memcpy(auxs, x[i].scales, 12);
+        uint32_t tmp = auxs[2];
+        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+        const int8_t * scales = (const int8_t *)auxs;
+
+        // SIMD dot product with register accumulators
+        v128_t v_acc0 = wasm_i32x4_splat(0);
+        v128_t v_acc1 = wasm_i32x4_splat(0);
+        a = aux8;
+        for (int j = 0; j < QK_K/16; ++j) {
+            const v128_t v_scale = wasm_i16x8_splat(scales[j] - 32);
+
+            // Process 16 elements per iteration
+            for (int k = 0; k < 2; ++k) {
+                const v128_t v_q8 = wasm_i16x8_load8x8(q8);
+                const v128_t v_a = wasm_i16x8_load8x8(a);
+
+                v128_t v_prod = wasm_i16x8_mul(v_q8, v_a);
+                v_prod = wasm_i16x8_mul(v_prod, v_scale);
+
+                v_acc0 = wasm_i32x4_add(v_acc0, wasm_i32x4_extend_low_i16x8(v_prod));
+                v_acc1 = wasm_i32x4_add(v_acc1, wasm_i32x4_extend_high_i16x8(v_prod));
+
+                q8 += 8;
+                a += 8;
+            }
+        }
+
+        // Accumulate results
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const v128_t v_d = wasm_f32x4_splat(d);
+        v128_t v_sum = wasm_f32x4_add(
+            wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d),
+            wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc1), v_d)
+        );
+
+        // Accumulate into sums vector
+        wasm_v128_store(sums, wasm_f32x4_add(wasm_v128_load(sums), v_sum));
+    }
+
+    // Horizontal sum
+    v128_t v_sum = wasm_f32x4_add(wasm_v128_load(sums), wasm_v128_load(sums + 4));
+    sumf = wasm_f32x4_extract_lane(v_sum, 0) +
+           wasm_f32x4_extract_lane(v_sum, 1) +
+           wasm_f32x4_extract_lane(v_sum, 2) +
+           wasm_f32x4_extract_lane(v_sum, 3);
+
+    *s = sumf;
+
 #elif defined __riscv_v_intrinsic
 
     uint32_t aux[3];
@@ -5150,9 +6119,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
 
-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].hmask;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         memcpy(aux, x[i].scales, 12);
         utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
@@ -5292,8 +6261,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         vector signed int vsumi6 = v0;
         vector signed int vsumi7 = v0;
 
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/128; ++j) {
             __builtin_prefetch(q3, 0, 1);
@@ -5397,8 +6366,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
 #elif defined __loongarch_asx
 
-    const __m256i m3 = __lasx_xvreplgr2vr_b(3);
-    const __m256i mone = __lasx_xvreplgr2vr_b(1);
     const __m128i m32 = __lsx_vreplgr2vr_b(32);
 
     __m256 acc = (__m256)__lasx_xvldi(0);
@@ -5408,8 +6375,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     for (int i = 0; i < nb; ++i) {
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         // Set up scales
         memcpy(aux, x[i].scales, 12);
         __m128i scales128 = lsx_set_w(
@@ -5418,10 +6385,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
             (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
         scales128 = __lsx_vsub_b(scales128, m32);
-
-        const
-        const
-        const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
+
+        const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
+        const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask));
 
         // high bit
         const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0);
@@ -5429,35 +6395,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         // integer accumulator
         __m256i sumi = __lasx_xvldi(0);
 
-        int bit = 0;
-        int is = 0;
-        __m256i xvbit;
-
-
         for (int j = 0; j < QK_K/128; ++j) {
             // load low 2 bits
             const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
 
-            xvbit = __lasx_xvreplgr2vr_h(bit);
             // prepare low and high bits
-            const __m256i q3l_0 =
-            const __m256i
-
-
-
-            const __m256i
-            const __m256i
-
-
-
-            const __m256i
-            const __m256i
-            ++bit;
-
-            xvbit = __lasx_xvreplgr2vr_h(bit);
-            const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
-            const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
-            ++bit;
+            const __m256i q3l_0 = __lasx_xvandi_b(q3bits, 3);
+            const __m256i q3l_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 2), 3);
+            const __m256i q3l_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 4), 3);
+            const __m256i q3l_3 = __lasx_xvsrli_b(q3bits, 6);
+            const __m256i q3h_0 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 0), 0), 2);
+            const __m256i q3h_1 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 1), 0), 2);
+            const __m256i q3h_2 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 2), 0), 2);
+            const __m256i q3h_3 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 3), 0), 2);
+            const __m256i q3_0 = __lasx_xvor_v(q3h_0, q3l_0);
+            const __m256i q3_1 = __lasx_xvor_v(q3h_1, q3l_1);
+            const __m256i q3_2 = __lasx_xvor_v(q3h_2, q3l_2);
+            const __m256i q3_3 = __lasx_xvor_v(q3h_3, q3l_3);
 
             // load Q8 quants
             const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
@@ -5465,29 +6419,16 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5465
6419
|
const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
5466
6420
|
const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
5467
6421
|
|
|
5468
|
-
|
|
5469
|
-
|
|
5470
|
-
|
|
5471
|
-
__m256i
|
|
5472
|
-
__m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1);
|
|
5473
|
-
__m256i q8s_2 = lasx_maddubs_h(q3h_2, q8_2);
|
|
5474
|
-
__m256i q8s_3 = lasx_maddubs_h(q3h_3, q8_3);
|
|
5475
|
-
|
|
5476
|
-
__m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0);
|
|
5477
|
-
__m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1);
|
|
5478
|
-
__m256i p16_2 = lasx_maddubs_h(q3l_2, q8_2);
|
|
5479
|
-
__m256i p16_3 = lasx_maddubs_h(q3l_3, q8_3);
|
|
5480
|
-
|
|
5481
|
-
p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
|
|
5482
|
-
p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
|
|
5483
|
-
p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
|
|
5484
|
-
p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
|
|
6422
|
+
__m256i p16_0 = lasx_madd_h_b(q8_0, q3_0);
|
|
6423
|
+
__m256i p16_1 = lasx_madd_h_b(q8_1, q3_1);
|
|
6424
|
+
__m256i p16_2 = lasx_madd_h_b(q8_2, q3_2);
|
|
6425
|
+
__m256i p16_3 = lasx_madd_h_b(q8_3, q3_3);
|
|
5485
6426
|
|
|
5486
6427
|
// multiply with scales
|
|
5487
|
-
p16_0 = lasx_madd_h(
|
|
5488
|
-
p16_1 = lasx_madd_h(
|
|
5489
|
-
p16_2 = lasx_madd_h(
|
|
5490
|
-
p16_3 = lasx_madd_h(
|
|
6428
|
+
p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0);
|
|
6429
|
+
p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1);
|
|
6430
|
+
p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2);
|
|
6431
|
+
p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3);
|
|
5491
6432
|
|
|
5492
6433
|
// accumulate
|
|
5493
6434
|
p16_0 = __lasx_xvadd_w(p16_0, p16_1);
|
|
@@ -5495,7 +6436,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5495
6436
|
sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2));
|
|
5496
6437
|
}
|
|
5497
6438
|
// multiply with block scale and accumulate
|
|
5498
|
-
acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc)
|
|
6439
|
+
acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
|
|
5499
6440
|
}
|
|
5500
6441
|
|
|
5501
6442
|
*s = hsum_float_8(acc);
|
|
@@ -5520,11 +6461,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5520
6461
|
|
|
5521
6462
|
float sumf = 0;
|
|
5522
6463
|
for (int i = 0; i < nb; ++i) {
|
|
5523
|
-
const uint8_t *
|
|
5524
|
-
const uint8_t *
|
|
5525
|
-
const int8_t *
|
|
6464
|
+
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
6465
|
+
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
|
6466
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
5526
6467
|
memset(aux32, 0, 8*sizeof(int32_t));
|
|
5527
|
-
int8_t *
|
|
6468
|
+
int8_t * GGML_RESTRICT a = aux8;
|
|
5528
6469
|
uint8_t m = 1;
|
|
5529
6470
|
for (int j = 0; j < QK_K; j += 128) {
|
|
5530
6471
|
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
|
@@ -5567,7 +6508,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5567
6508
|
|
|
5568
6509
|
}
|
|
5569
6510
|
|
|
5570
|
-
void ggml_vec_dot_q4_K_q8_K(int n, float *
|
|
6511
|
+
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
5571
6512
|
assert(n % QK_K == 0);
|
|
5572
6513
|
assert(nrc == 1);
|
|
5573
6514
|
UNUSED(nrc);
|
|
@@ -5575,8 +6516,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5575
6516
|
UNUSED(by);
|
|
5576
6517
|
UNUSED(bs);
|
|
5577
6518
|
|
|
5578
|
-
const block_q4_K *
|
|
5579
|
-
const block_q8_K *
|
|
6519
|
+
const block_q4_K * GGML_RESTRICT x = vx;
|
|
6520
|
+
const block_q8_K * GGML_RESTRICT y = vy;
|
|
5580
6521
|
|
|
5581
6522
|
const int nb = n / QK_K;
|
|
5582
6523
|
|
|
@@ -5611,8 +6552,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5611
6552
|
|
|
5612
6553
|
const uint8_t * scales = (const uint8_t *)utmp;
|
|
5613
6554
|
|
|
5614
|
-
const uint8_t *
|
|
5615
|
-
const int8_t *
|
|
6555
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
6556
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
5616
6557
|
|
|
5617
6558
|
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
|
5618
6559
|
const svuint8_t m4b = svdup_n_u8(0xf);
|
|
@@ -5667,7 +6608,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5667
6608
|
}
|
|
5668
6609
|
}
|
|
5669
6610
|
*s = sumf;
|
|
5670
|
-
#elif __ARM_NEON
|
|
6611
|
+
#elif defined __ARM_NEON
|
|
5671
6612
|
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
|
5672
6613
|
const int32x4_t mzero = vdupq_n_s32(0);
|
|
5673
6614
|
|
|
@@ -5699,8 +6640,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5699
6640
|
|
|
5700
6641
|
const uint8_t * scales = (const uint8_t *)utmp;
|
|
5701
6642
|
|
|
5702
|
-
const uint8_t *
|
|
5703
|
-
const int8_t *
|
|
6643
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
6644
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
5704
6645
|
|
|
5705
6646
|
int32_t sumi1 = 0;
|
|
5706
6647
|
int32_t sumi2 = 0;
|
|
@@ -5712,20 +6653,121 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5712
6653
|
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
|
5713
6654
|
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
|
5714
6655
|
|
|
5715
|
-
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
|
5716
|
-
sumi1 += vaddvq_s32(p1) * scales[2*j+0];
|
|
6656
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
|
6657
|
+
sumi1 += vaddvq_s32(p1) * scales[2*j+0];
|
|
6658
|
+
|
|
6659
|
+
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
|
6660
|
+
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
|
6661
|
+
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
|
6662
|
+
|
|
6663
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
|
6664
|
+
|
|
6665
|
+
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
|
|
6666
|
+
}
|
|
6667
|
+
|
|
6668
|
+
sumf += d * (sumi1 + sumi2);
|
|
6669
|
+
|
|
6670
|
+
}
|
|
6671
|
+
|
|
6672
|
+
*s = sumf;
|
|
6673
|
+
|
|
6674
|
+
#elif defined __wasm_simd128__
|
|
6675
|
+
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
|
6676
|
+
float sumf = 0;
|
|
6677
|
+
|
|
6678
|
+
for (int i = 0; i < nb; ++i) {
|
|
6679
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
6680
|
+
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
|
|
6681
|
+
|
|
6682
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
6683
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6684
|
+
|
|
6685
|
+
// Process scales and mins
|
|
6686
|
+
memcpy(utmp, x[i].scales, 12);
|
|
6687
|
+
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
6688
|
+
const uint32_t uaux = utmp[1] & kmask1;
|
|
6689
|
+
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
6690
|
+
utmp[2] = uaux;
|
|
6691
|
+
utmp[0] &= kmask1;
|
|
6692
|
+
|
|
6693
|
+
// Sum mins * q8sums
|
|
6694
|
+
int32_t sumi = 0;
|
|
6695
|
+
const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
|
|
6696
|
+
const uint8_t * m = (const uint8_t *)&utmp[2];
|
|
6697
|
+
for (int j = 0; j < 16; j += 2) {
|
|
6698
|
+
sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
|
|
6699
|
+
}
|
|
6700
|
+
sumf -= dmin * sumi;
|
|
6701
|
+
|
|
6702
|
+
int32_t sumi1 = 0;
|
|
6703
|
+
int32_t sumi2 = 0;
|
|
5717
6704
|
|
|
5718
|
-
|
|
5719
|
-
|
|
5720
|
-
|
|
6705
|
+
for (int j = 0; j < QK_K/64; ++j) {
|
|
6706
|
+
// Load 64 4-bit weights (32 bytes)
|
|
6707
|
+
const v128_t q4x0 = wasm_v128_load(q4);
|
|
6708
|
+
const v128_t q4x1 = wasm_v128_load(q4 + 16);
|
|
6709
|
+
q4 += 32;
|
|
5721
6710
|
|
|
5722
|
-
|
|
6711
|
+
// Split into low/high nibbles
|
|
6712
|
+
const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F));
|
|
6713
|
+
const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4);
|
|
6714
|
+
const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F));
|
|
6715
|
+
const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4);
|
|
6716
|
+
|
|
6717
|
+
// Load 64 8-bit values (64 bytes)
|
|
6718
|
+
const v128_t q8x0 = wasm_v128_load(q8);
|
|
6719
|
+
const v128_t q8x1 = wasm_v128_load(q8 + 16);
|
|
6720
|
+
const v128_t q8x2 = wasm_v128_load(q8 + 32);
|
|
6721
|
+
const v128_t q8x3 = wasm_v128_load(q8 + 48);
|
|
6722
|
+
q8 += 64;
|
|
5723
6723
|
|
|
5724
|
-
|
|
6724
|
+
// Low nibble products
|
|
6725
|
+
v128_t vacc1 = wasm_i32x4_dot_i16x8(
|
|
6726
|
+
wasm_i16x8_extend_low_i8x16(q4l0),
|
|
6727
|
+
wasm_i16x8_extend_low_i8x16(q8x0)
|
|
6728
|
+
);
|
|
6729
|
+
vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
|
|
6730
|
+
wasm_i16x8_extend_high_i8x16(q4l0),
|
|
6731
|
+
wasm_i16x8_extend_high_i8x16(q8x0)
|
|
6732
|
+
));
|
|
6733
|
+
vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
|
|
6734
|
+
wasm_i16x8_extend_low_i8x16(q4l1),
|
|
6735
|
+
wasm_i16x8_extend_low_i8x16(q8x1)
|
|
6736
|
+
));
|
|
6737
|
+
vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
|
|
6738
|
+
wasm_i16x8_extend_high_i8x16(q4l1),
|
|
6739
|
+
wasm_i16x8_extend_high_i8x16(q8x1)
|
|
6740
|
+
));
|
|
6741
|
+
|
|
6742
|
+
// High nibble products
|
|
6743
|
+
v128_t vacc2 = wasm_i32x4_dot_i16x8(
|
|
6744
|
+
wasm_i16x8_extend_low_i8x16(q4h0),
|
|
6745
|
+
wasm_i16x8_extend_low_i8x16(q8x2)
|
|
6746
|
+
);
|
|
6747
|
+
vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
|
|
6748
|
+
wasm_i16x8_extend_high_i8x16(q4h0),
|
|
6749
|
+
wasm_i16x8_extend_high_i8x16(q8x2)
|
|
6750
|
+
));
|
|
6751
|
+
vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
|
|
6752
|
+
wasm_i16x8_extend_low_i8x16(q4h1),
|
|
6753
|
+
wasm_i16x8_extend_low_i8x16(q8x3)
|
|
6754
|
+
));
|
|
6755
|
+
vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
|
|
6756
|
+
wasm_i16x8_extend_high_i8x16(q4h1),
|
|
6757
|
+
wasm_i16x8_extend_high_i8x16(q8x3)
|
|
6758
|
+
));
|
|
6759
|
+
|
|
6760
|
+
// Accumulate scaled results
|
|
6761
|
+
int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) +
|
|
6762
|
+
wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3);
|
|
6763
|
+
sumi1 += vacc1_sum * scales[2*j];
|
|
6764
|
+
|
|
6765
|
+
int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) +
|
|
6766
|
+
wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3);
|
|
6767
|
+
sumi2 += vacc2_sum * scales[2*j+1];
|
|
5725
6768
|
}
|
|
5726
6769
|
|
|
5727
6770
|
sumf += d * (sumi1 + sumi2);
|
|
5728
|
-
|
|
5729
6771
|
}
|
|
5730
6772
|
|
|
5731
6773
|
*s = sumf;
|
|
@@ -5749,8 +6791,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5749
6791
|
utmp[2] = uaux;
|
|
5750
6792
|
utmp[0] &= kmask1;
|
|
5751
6793
|
|
|
5752
|
-
const uint8_t *
|
|
5753
|
-
const int8_t *
|
|
6794
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
6795
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
5754
6796
|
|
|
5755
6797
|
const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
|
|
5756
6798
|
|
|
@@ -5808,8 +6850,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5808
6850
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
5809
6851
|
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
|
5810
6852
|
|
|
5811
|
-
const uint8_t *
|
|
5812
|
-
const int8_t *
|
|
6853
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
6854
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
5813
6855
|
|
|
5814
6856
|
memcpy(utmp, x[i].scales, 12);
|
|
5815
6857
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
@@ -5909,8 +6951,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
5909
6951
|
vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
|
|
5910
6952
|
sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
|
|
5911
6953
|
|
|
5912
|
-
const uint8_t *
|
|
5913
|
-
const int8_t *
|
|
6954
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
6955
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
5914
6956
|
|
|
5915
6957
|
vl = 32;
|
|
5916
6958
|
|
|
@@ -6011,8 +7053,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6011
7053
|
vector signed int vsumi2 = v0;
|
|
6012
7054
|
vector signed int vsumi3 = v0;
|
|
6013
7055
|
|
|
6014
|
-
const uint8_t *
|
|
6015
|
-
const int8_t *
|
|
7056
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
7057
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6016
7058
|
|
|
6017
7059
|
for (int j = 0; j < QK_K/64; j+=2) {
|
|
6018
7060
|
__builtin_prefetch(q4, 0, 1);
|
|
@@ -6087,11 +7129,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6087
7129
|
*s = vec_extract(vsumf0, 0);
|
|
6088
7130
|
|
|
6089
7131
|
#elif defined __loongarch_asx
|
|
6090
|
-
GGML_UNUSED(kmask1);
|
|
6091
|
-
GGML_UNUSED(kmask2);
|
|
6092
|
-
GGML_UNUSED(kmask3);
|
|
6093
|
-
|
|
6094
|
-
const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
|
|
6095
7132
|
|
|
6096
7133
|
__m256 acc = (__m256)__lasx_xvldi(0);
|
|
6097
7134
|
__m128 acc_m = (__m128)__lsx_vldi(0);
|
|
@@ -6108,36 +7145,37 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6108
7145
|
utmp[2] = uaux;
|
|
6109
7146
|
utmp[0] &= kmask1;
|
|
6110
7147
|
|
|
6111
|
-
const uint8_t *
|
|
6112
|
-
const int8_t *
|
|
7148
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
7149
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6113
7150
|
|
|
6114
|
-
const
|
|
7151
|
+
const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
|
|
7152
|
+
const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
|
|
7153
|
+
const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0);
|
|
6115
7154
|
|
|
6116
7155
|
const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
|
|
6117
7156
|
const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
|
|
6118
|
-
const __m128i prod = lsx_madd_h(
|
|
7157
|
+
const __m128i prod = lsx_madd_h(mins128, q8s);
|
|
6119
7158
|
acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
|
|
6120
7159
|
|
|
6121
|
-
const
|
|
6122
|
-
const __m256i scales = lasx_insertf128(sc128, sc128);
|
|
7160
|
+
const __m256i scales = lasx_insertf128(scales128, scales128);
|
|
6123
7161
|
|
|
6124
7162
|
__m256i sumi = __lasx_xvldi(0);
|
|
6125
7163
|
|
|
6126
7164
|
for (int j = 0; j < QK_K/64; ++j) {
|
|
6127
7165
|
|
|
6128
|
-
const __m256i scale_l =
|
|
6129
|
-
const __m256i scale_h =
|
|
7166
|
+
const __m256i scale_l = lasx_xvrepl128vei_h(scales, 2 * j + 0);
|
|
7167
|
+
const __m256i scale_h = lasx_xvrepl128vei_h(scales, 2 * j + 1);
|
|
6130
7168
|
|
|
6131
7169
|
const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
|
|
6132
|
-
const __m256i q4l =
|
|
6133
|
-
const __m256i q4h =
|
|
7170
|
+
const __m256i q4l = __lasx_xvandi_b(q4bits, 0xf);
|
|
7171
|
+
const __m256i q4h = __lasx_xvsrli_b(q4bits, 4);
|
|
6134
7172
|
|
|
6135
7173
|
const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
6136
|
-
__m256i p16l =
|
|
7174
|
+
__m256i p16l = lasx_madd_h_b(q4l, q8l);
|
|
6137
7175
|
p16l = lasx_madd_h(scale_l, p16l);
|
|
6138
7176
|
|
|
6139
7177
|
const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
6140
|
-
__m256i p16h =
|
|
7178
|
+
__m256i p16h = lasx_madd_h_b(q4h, q8h);
|
|
6141
7179
|
p16h = lasx_madd_h(scale_h, p16h);
|
|
6142
7180
|
const __m256i sumj = __lasx_xvadd_w(p16l, p16h);
|
|
6143
7181
|
|
|
@@ -6154,9 +7192,78 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6154
7192
|
acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
|
|
6155
7193
|
|
|
6156
7194
|
|
|
6157
|
-
|
|
6158
|
-
|
|
6159
|
-
|
|
7195
|
+
*s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
|
|
7196
|
+
#elif defined(__VXE__) || defined(__VXE2__)
|
|
7197
|
+
const uint8x16_t v_lm = vec_splat_u8(0x0F);
|
|
7198
|
+
const int32x4_t v_z = vec_splat_s32(0);
|
|
7199
|
+
|
|
7200
|
+
uint8x16_t v_x[2];
|
|
7201
|
+
int8x16_t v_xl[2];
|
|
7202
|
+
int8x16_t v_y[2];
|
|
7203
|
+
|
|
7204
|
+
float sumf = 0;
|
|
7205
|
+
|
|
7206
|
+
for (int i = 0; i < nb; ++i) {
|
|
7207
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
7208
|
+
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
|
7209
|
+
|
|
7210
|
+
const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
|
|
7211
|
+
const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
|
|
7212
|
+
const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
|
|
7213
|
+
|
|
7214
|
+
memcpy(utmp, x[i].scales, 12);
|
|
7215
|
+
|
|
7216
|
+
uint32x4_t v_mins8 = { 0 };
|
|
7217
|
+
v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0);
|
|
7218
|
+
v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1);
|
|
7219
|
+
|
|
7220
|
+
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
7221
|
+
utmp[0] &= kmask1;
|
|
7222
|
+
|
|
7223
|
+
const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
|
|
7224
|
+
|
|
7225
|
+
const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
|
|
7226
|
+
const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
|
|
7227
|
+
const int32x4_t v_mins = v_minso + v_minse;
|
|
7228
|
+
sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
|
|
7229
|
+
|
|
7230
|
+
const uint8_t * scales = (const uint8_t *)utmp;
|
|
7231
|
+
const uint8_t * GGML_RESTRICT x0 = x[i].qs;
|
|
7232
|
+
const int8_t * GGML_RESTRICT y0 = y[i].qs;
|
|
7233
|
+
|
|
7234
|
+
int32_t sumi1 = 0;
|
|
7235
|
+
int32_t sumi2 = 0;
|
|
7236
|
+
|
|
7237
|
+
for (int j = 0; j < QK_K/64; ++j) {
|
|
7238
|
+
v_x[0] = vec_xl(0 , x0);
|
|
7239
|
+
v_x[1] = vec_xl(16, x0);
|
|
7240
|
+
x0 += 32;
|
|
7241
|
+
|
|
7242
|
+
v_y[0] = vec_xl(0 , y0);
|
|
7243
|
+
v_y[1] = vec_xl(16, y0);
|
|
7244
|
+
y0 += 32;
|
|
7245
|
+
|
|
7246
|
+
v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm);
|
|
7247
|
+
v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
|
|
7248
|
+
|
|
7249
|
+
const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
|
|
7250
|
+
sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
|
|
7251
|
+
|
|
7252
|
+
v_y[0] = vec_xl(0 , y0);
|
|
7253
|
+
v_y[1] = vec_xl(16, y0);
|
|
7254
|
+
y0 += 32;
|
|
7255
|
+
|
|
7256
|
+
v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4);
|
|
7257
|
+
v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
|
|
7258
|
+
|
|
7259
|
+
const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
|
|
7260
|
+
sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
|
|
7261
|
+
}
|
|
7262
|
+
|
|
7263
|
+
sumf += d * (sumi1 + sumi2);
|
|
7264
|
+
}
|
|
7265
|
+
|
|
7266
|
+
*s = sumf;
|
|
6160
7267
|
#else
|
|
6161
7268
|
|
|
6162
7269
|
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
|
@@ -6170,10 +7277,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6170
7277
|
|
|
6171
7278
|
float sumf = 0;
|
|
6172
7279
|
for (int i = 0; i < nb; ++i) {
|
|
6173
|
-
const uint8_t *
|
|
6174
|
-
const int8_t *
|
|
7280
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
7281
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6175
7282
|
memset(aux32, 0, 8*sizeof(int32_t));
|
|
6176
|
-
int8_t *
|
|
7283
|
+
int8_t * GGML_RESTRICT a = aux8;
|
|
6177
7284
|
for (int j = 0; j < QK_K/64; ++j) {
|
|
6178
7285
|
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
6179
7286
|
a += 32;
|
|
@@ -6216,7 +7323,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6216
7323
|
#endif
|
|
6217
7324
|
}
|
|
6218
7325
|
|
|
6219
|
-
void ggml_vec_dot_q5_K_q8_K(int n, float *
|
|
7326
|
+
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
6220
7327
|
assert(n % QK_K == 0);
|
|
6221
7328
|
assert(nrc == 1);
|
|
6222
7329
|
UNUSED(nrc);
|
|
@@ -6224,8 +7331,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6224
7331
|
UNUSED(by);
|
|
6225
7332
|
UNUSED(bs);
|
|
6226
7333
|
|
|
6227
|
-
const block_q5_K *
|
|
6228
|
-
const block_q8_K *
|
|
7334
|
+
const block_q5_K * GGML_RESTRICT x = vx;
|
|
7335
|
+
const block_q8_K * GGML_RESTRICT y = vy;
|
|
6229
7336
|
|
|
6230
7337
|
const int nb = n / QK_K;
|
|
6231
7338
|
|
|
@@ -6267,9 +7374,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6267
7374
|
|
|
6268
7375
|
const uint8_t * scales = (const uint8_t *)utmp;
|
|
6269
7376
|
|
|
6270
|
-
const uint8_t *
|
|
6271
|
-
const uint8_t *
|
|
6272
|
-
const int8_t *
|
|
7377
|
+
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
7378
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
7379
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6273
7380
|
|
|
6274
7381
|
ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
|
|
6275
7382
|
|
|
@@ -6314,8 +7421,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6314
7421
|
float summs = 0.f;
|
|
6315
7422
|
|
|
6316
7423
|
for (int i = 0; i < nb; ++i) {
|
|
6317
|
-
const uint8_t *
|
|
6318
|
-
const int8_t *
|
|
7424
|
+
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
7425
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6319
7426
|
|
|
6320
7427
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
6321
7428
|
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
|
@@ -6398,8 +7505,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6398
7505
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
6399
7506
|
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
|
6400
7507
|
|
|
6401
|
-
const uint8_t *
|
|
6402
|
-
const int8_t *
|
|
7508
|
+
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
7509
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6403
7510
|
|
|
6404
7511
|
memcpy(utmp, x[i].scales, 12);
|
|
6405
7512
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
@@ -6482,6 +7589,118 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6482
7589
|
|
|
6483
7590
|
*s = hsum_float_8(acc) + summs;
|
|
6484
7591
|
|
|
7592
|
+
#elif defined __wasm_simd128__
|
|
7593
|
+
//const uint8_t * scales = (const uint8_t*)&utmp[0];
|
|
7594
|
+
float sumf = 0;
|
|
7595
|
+
|
|
7596
|
+
for (int i = 0; i < nb; ++i) {
|
|
7597
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
7598
|
+
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
|
|
7599
|
+
|
|
7600
|
+
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
7601
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
7602
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
7603
|
+
|
|
7604
|
+
// Process scales and mins
|
|
7605
|
+
memcpy(utmp, x[i].scales, 12);
|
|
7606
|
+
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
7607
|
+
const uint32_t uaux = utmp[1] & kmask1;
|
|
7608
|
+
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
7609
|
+
utmp[2] = uaux;
|
|
7610
|
+
utmp[0] &= kmask1;
|
|
7611
|
+
|
|
7612
|
+
// Sum mins * q8sums
|
|
7613
|
+
int32_t sumi_mins = 0;
|
|
7614
|
+
const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
|
|
7615
|
+
const uint8_t * m = (const uint8_t *)&utmp[2];
|
|
7616
|
+
for (int j = 0; j < 16; j += 2) {
|
|
7617
|
+
sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
|
|
7618
|
+
}
|
|
7619
|
+
sumf -= dmin * sumi_mins; // Correct subtraction
|
|
7620
|
+
|
|
7621
|
+
v128_t qh0 = wasm_v128_load(qh);
|
|
7622
|
+
v128_t qh1 = wasm_v128_load(qh + 16);
|
|
7623
|
+
const uint8_t * sc = (const uint8_t *)utmp;
|
|
7624
|
+
|
|
7625
|
+
int32_t sumi = 0;
|
|
7626
|
+
|
|
7627
|
+
for (int j = 0; j < QK_K/64; ++j) {
|
|
7628
|
+
const int shift = j * 2;
|
|
7629
|
+
v128_t qh_shift0 = wasm_u8x16_shr(qh0, shift);
|
|
7630
|
+
v128_t qh_shift1 = wasm_u8x16_shr(qh1, shift);
|
|
7631
|
+
|
|
7632
|
+
v128_t qh_low0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x01)), 4);
|
|
7633
|
+
v128_t qh_high0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x02)), 3);
|
|
7634
|
+
v128_t qh_low1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x01)), 4);
|
|
7635
|
+
v128_t qh_high1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x02)), 3);
|
|
7636
|
+
|
|
7637
|
+
v128_t q5_0 = wasm_v128_load(q5);
|
|
7638
|
+
v128_t q5_1 = wasm_v128_load(q5 + 16);
|
|
7639
|
+
q5 += 32;
|
|
7640
|
+
|
|
7641
|
+
v128_t q5l_0 = wasm_v128_or(wasm_v128_and(q5_0, wasm_i8x16_splat(0x0F)), qh_low0);
|
|
7642
|
+
v128_t q5h_0 = wasm_v128_or(wasm_u8x16_shr(q5_0, 4), qh_high0);
|
|
7643
|
+
v128_t q5l_1 = wasm_v128_or(wasm_v128_and(q5_1, wasm_i8x16_splat(0x0F)), qh_low1);
|
|
7644
|
+
v128_t q5h_1 = wasm_v128_or(wasm_u8x16_shr(q5_1, 4), qh_high1);
|
|
7645
|
+
|
|
7646
|
+
v128_t q8_0 = wasm_v128_load(q8);
|
|
7647
|
+
v128_t q8_1 = wasm_v128_load(q8 + 16);
|
|
7648
|
+
v128_t q8_2 = wasm_v128_load(q8 + 32);
|
|
7649
|
+
v128_t q8_3 = wasm_v128_load(q8 + 48);
|
|
7650
|
+
q8 += 64;
|
|
7651
|
+
|
|
7652
|
+
// Process low quants
|
|
7653
|
+
v128_t pl0 = wasm_i32x4_dot_i16x8(
|
|
7654
|
+
wasm_i16x8_extend_low_i8x16(q5l_0),
|
|
7655
|
+
wasm_i16x8_extend_low_i8x16(q8_0)
|
|
7656
|
+
);
|
|
7657
|
+
pl0 = wasm_i32x4_add(pl0, wasm_i32x4_dot_i16x8(
|
|
7658
|
+
wasm_i16x8_extend_high_i8x16(q5l_0),
|
|
7659
|
+
wasm_i16x8_extend_high_i8x16(q8_0)
|
|
7660
|
+
));
|
|
7661
|
+
v128_t pl1 = wasm_i32x4_dot_i16x8(
|
|
7662
|
+
wasm_i16x8_extend_low_i8x16(q5l_1),
|
|
7663
|
+
wasm_i16x8_extend_low_i8x16(q8_1)
|
|
7664
|
+
);
|
|
7665
|
+
pl1 = wasm_i32x4_add(pl1, wasm_i32x4_dot_i16x8(
|
|
7666
|
+
wasm_i16x8_extend_high_i8x16(q5l_1),
|
|
7667
|
+
wasm_i16x8_extend_high_i8x16(q8_1)
|
|
7668
|
+
));
|
|
7669
|
+
v128_t sum_low = wasm_i32x4_add(pl0, pl1);
|
|
7670
|
+
|
|
7671
|
+
// Process high quants
|
|
7672
|
+
v128_t ph0 = wasm_i32x4_dot_i16x8(
|
|
7673
|
+
wasm_i16x8_extend_low_i8x16(q5h_0),
|
|
7674
|
+
wasm_i16x8_extend_low_i8x16(q8_2)
|
|
7675
|
+
);
|
|
7676
|
+
ph0 = wasm_i32x4_add(ph0, wasm_i32x4_dot_i16x8(
|
|
7677
|
+
wasm_i16x8_extend_high_i8x16(q5h_0),
|
|
7678
|
+
wasm_i16x8_extend_high_i8x16(q8_2)
|
|
7679
|
+
));
|
|
7680
|
+
v128_t ph1 = wasm_i32x4_dot_i16x8(
|
|
7681
|
+
wasm_i16x8_extend_low_i8x16(q5h_1),
|
|
7682
|
+
wasm_i16x8_extend_low_i8x16(q8_3)
|
|
7683
|
+
);
|
|
7684
|
+
ph1 = wasm_i32x4_add(ph1, wasm_i32x4_dot_i16x8(
|
|
7685
|
+
wasm_i16x8_extend_high_i8x16(q5h_1),
|
|
7686
|
+
wasm_i16x8_extend_high_i8x16(q8_3)
|
|
7687
|
+
));
|
|
7688
|
+
v128_t sum_high = wasm_i32x4_add(ph0, ph1);
|
|
7689
|
+
|
|
7690
|
+
// Accumulate with scale factors
|
|
7691
|
+
int32_t sl = wasm_i32x4_extract_lane(sum_low, 0) + wasm_i32x4_extract_lane(sum_low, 1) +
|
|
7692
|
+
wasm_i32x4_extract_lane(sum_low, 2) + wasm_i32x4_extract_lane(sum_low, 3);
|
|
7693
|
+
int32_t sh = wasm_i32x4_extract_lane(sum_high, 0) + wasm_i32x4_extract_lane(sum_high, 1) +
|
|
7694
|
+
wasm_i32x4_extract_lane(sum_high, 2) + wasm_i32x4_extract_lane(sum_high, 3);
|
|
7695
|
+
|
|
7696
|
+
sumi += sl * sc[2*j] + sh * sc[2*j+1];
|
|
7697
|
+
}
|
|
7698
|
+
|
|
7699
|
+
sumf += d * sumi;
|
|
7700
|
+
}
|
|
7701
|
+
|
|
7702
|
+
*s = sumf;
|
|
7703
|
+
|
|
6485
7704
|
#elif defined __riscv_v_intrinsic
|
|
6486
7705
|
|
|
6487
7706
|
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
|
@@ -6496,9 +7715,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6496
7715
|
|
|
6497
7716
|
vl = 8;
|
|
6498
7717
|
|
|
6499
|
-
const uint8_t *
|
|
6500
|
-
const uint8_t *
|
|
6501
|
-
const int8_t *
|
|
7718
|
+
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
7719
|
+
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
|
7720
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6502
7721
|
|
|
6503
7722
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
6504
7723
|
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
@@ -6637,8 +7856,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6637
7856
|
vector signed int vsumi2 = v0;
|
|
6638
7857
|
vector signed int vsumi3 = v0;
|
|
6639
7858
|
|
|
6640
|
-
const uint8_t *
|
|
6641
|
-
const int8_t *
|
|
7859
|
+
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
7860
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6642
7861
|
|
|
6643
7862
|
for (int j = 0; j < QK_K/64; ++j) {
|
|
6644
7863
|
__builtin_prefetch(q5, 0, 1);
|
|
@@ -6704,22 +7923,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6704
7923
|
*s = vec_extract(vsumf0, 0);
|
|
6705
7924
|
|
|
6706
7925
|
#elif defined __loongarch_asx
|
|
6707
|
-
GGML_UNUSED(kmask1);
|
|
6708
|
-
GGML_UNUSED(kmask2);
|
|
6709
|
-
GGML_UNUSED(kmask3);
|
|
6710
|
-
|
|
6711
|
-
const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
|
|
6712
|
-
const __m128i mzero = __lsx_vldi(0);
|
|
6713
|
-
const __m256i mone = __lasx_xvreplgr2vr_b(1);
|
|
6714
7926
|
|
|
6715
7927
|
__m256 acc = (__m256)__lasx_xvldi(0);
|
|
7928
|
+
__m128 acc_m = (__m128)__lsx_vldi(0);
|
|
6716
7929
|
|
|
6717
|
-
|
|
6718
|
-
|
|
6719
|
-
for (int i = 0; i < nb; ++i) {
|
|
7930
|
+
for (int i = 0; i < nb; ++i) {
|
|
6720
7931
|
|
|
6721
|
-
const uint8_t *
|
|
6722
|
-
const int8_t *
|
|
7932
|
+
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
|
7933
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6723
7934
|
|
|
6724
7935
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
6725
7936
|
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
|
@@ -6731,49 +7942,40 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6731
7942
|
utmp[2] = uaux;
|
|
6732
7943
|
utmp[0] &= kmask1;
|
|
6733
7944
|
|
|
6734
|
-
const
|
|
7945
|
+
const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
|
|
7946
|
+
const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
|
|
7947
|
+
const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0);
|
|
6735
7948
|
|
|
6736
7949
|
const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
|
|
6737
7950
|
const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
|
|
6738
|
-
const __m128i prod = lsx_madd_h(
|
|
6739
|
-
|
|
6740
|
-
summs += dmin * __lsx_vpickve2gr_w(hsum, 0); //TODO check
|
|
7951
|
+
const __m128i prod = lsx_madd_h(mins128, q8s);
|
|
7952
|
+
acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
|
|
6741
7953
|
|
|
6742
|
-
const
|
|
6743
|
-
const __m256i scales = lasx_insertf128(sc128, sc128);
|
|
7954
|
+
const __m256i scales = lasx_insertf128(scales128, scales128);
|
|
6744
7955
|
|
|
6745
7956
|
const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0);
|
|
6746
|
-
__m256i hmask = mone;
|
|
6747
7957
|
|
|
6748
7958
|
__m256i sumi = __lasx_xvldi(0);
|
|
6749
7959
|
|
|
6750
|
-
int bit = 0;
|
|
6751
|
-
__m256i xvbit;
|
|
6752
|
-
|
|
6753
7960
|
for (int j = 0; j < QK_K/64; ++j) {
|
|
6754
7961
|
|
|
6755
|
-
const __m256i scale_0 =
|
|
6756
|
-
const __m256i scale_1 =
|
|
7962
|
+
const __m256i scale_0 = lasx_xvrepl128vei_h(scales, 2 * j + 0);
|
|
7963
|
+
const __m256i scale_1 = lasx_xvrepl128vei_h(scales, 2 * j + 1);
|
|
6757
7964
|
|
|
6758
7965
|
const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
|
|
6759
7966
|
|
|
6760
|
-
|
|
6761
|
-
const __m256i
|
|
6762
|
-
const __m256i q5h_0 =
|
|
6763
|
-
const __m256i
|
|
6764
|
-
|
|
6765
|
-
|
|
6766
|
-
xvbit = __lasx_xvreplgr2vr_h(bit++);
|
|
6767
|
-
const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
|
|
6768
|
-
const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
|
|
6769
|
-
const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
|
|
6770
|
-
hmask = __lasx_xvslli_h(hmask, 1);
|
|
7967
|
+
const __m256i q5l_0 = __lasx_xvandi_b(q5bits, 0xf);
|
|
7968
|
+
const __m256i q5l_1 = __lasx_xvsrli_b(q5bits, 4);
|
|
7969
|
+
const __m256i q5h_0 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 0), 0), 0xef);
|
|
7970
|
+
const __m256i q5h_1 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 1), 0), 0xef);
|
|
7971
|
+
const __m256i q5_0 = __lasx_xvor_v(q5l_0, q5h_0);
|
|
7972
|
+
const __m256i q5_1 = __lasx_xvor_v(q5l_1, q5h_1);
|
|
6771
7973
|
|
|
6772
7974
|
const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
6773
7975
|
const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
6774
7976
|
|
|
6775
|
-
__m256i p16_0 =
|
|
6776
|
-
__m256i p16_1 =
|
|
7977
|
+
__m256i p16_0 = lasx_madd_h_b(q5_0, q8_0);
|
|
7978
|
+
__m256i p16_1 = lasx_madd_h_b(q5_1, q8_1);
|
|
6777
7979
|
|
|
6778
7980
|
p16_0 = lasx_madd_h(scale_0, p16_0);
|
|
6779
7981
|
p16_1 = lasx_madd_h(scale_1, p16_1);
|
|
@@ -6787,8 +7989,98 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6787
7989
|
|
|
6788
7990
|
}
|
|
6789
7991
|
|
|
6790
|
-
|
|
7992
|
+
acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 8));
|
|
7993
|
+
acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4));
|
|
7994
|
+
|
|
7995
|
+
*s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
|
|
7996
|
+
#elif defined(__VXE__) || defined(__VXE2__)
|
|
7997
|
+
const uint8x16_t v_lm = vec_splat_u8(0x0F);
|
|
7998
|
+
const uint8x16_t v_1m = vec_splat_u8(0x01);
|
|
7999
|
+
const uint8x16_t v_2m = vec_splat_u8(0x02);
|
|
8000
|
+
|
|
8001
|
+
const int32x4_t v_z = vec_splat_s32(0);
|
|
8002
|
+
|
|
8003
|
+
const uchar8x16_t v_minsm = {
|
|
8004
|
+
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
|
8005
|
+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
|
|
8006
|
+
};
|
|
8007
|
+
|
|
8008
|
+
int8x16_t q5b[4];
|
|
8009
|
+
uint8x16_t q5h[4];
|
|
8010
|
+
|
|
8011
|
+
uint8x16_t v_xl[2];
|
|
8012
|
+
uint8x16_t v_xh[2];
|
|
8013
|
+
int8x16_t v_y[4];
|
|
8014
|
+
|
|
8015
|
+
float sumf = 0;
|
|
8016
|
+
|
|
8017
|
+
for (int i = 0; i < nb; ++i) {
|
|
8018
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
8019
|
+
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
|
8020
|
+
|
|
8021
|
+
const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
|
|
8022
|
+
const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
|
|
8023
|
+
const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
|
|
8024
|
+
|
|
8025
|
+
memcpy(utmp, x[i].scales, 12);
|
|
8026
|
+
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
8027
|
+
const uint32_t uaux = utmp[1] & kmask1;
|
|
8028
|
+
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
8029
|
+
utmp[2] = uaux;
|
|
8030
|
+
utmp[0] &= kmask1;
|
|
8031
|
+
|
|
8032
|
+
const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
|
|
8033
|
+
const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
|
|
8034
|
+
const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
|
|
8035
|
+
|
|
8036
|
+
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
|
|
8037
|
+
const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
|
|
8038
|
+
const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
|
|
8039
|
+
const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
|
|
8040
|
+
|
|
8041
|
+
const uint8_t * scales = (const uint8_t *)utmp;
|
|
8042
|
+
const uint8_t * GGML_RESTRICT x0l = x[i].qs;
|
|
8043
|
+
const uint8_t * GGML_RESTRICT x0h = x[i].qh;
|
|
8044
|
+
const int8_t * GGML_RESTRICT y0 = y[i].qs;
|
|
8045
|
+
|
|
8046
|
+
v_xh[0] = vec_xl(0 , x0h);
|
|
8047
|
+
v_xh[1] = vec_xl(16, x0h);
|
|
8048
|
+
|
|
8049
|
+
int32_t sumi = 0;
|
|
8050
|
+
for (int j = 0; j < QK_K/64; ++j) {
|
|
8051
|
+
v_xl[0] = vec_xl(0 , x0l);
|
|
8052
|
+
v_xl[1] = vec_xl(16, x0l);
|
|
8053
|
+
x0l += 32;
|
|
8054
|
+
|
|
8055
|
+
v_y[0] = vec_xl(0 , y0);
|
|
8056
|
+
v_y[1] = vec_xl(16, y0);
|
|
8057
|
+
v_y[2] = vec_xl(32, y0);
|
|
8058
|
+
v_y[3] = vec_xl(48, y0);
|
|
8059
|
+
y0 += 64;
|
|
8060
|
+
|
|
8061
|
+
q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
|
|
8062
|
+
q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
|
|
8063
|
+
q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
|
|
8064
|
+
q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
|
|
8065
|
+
v_xh[0] = vec_sr(v_xh[0], 2);
|
|
8066
|
+
v_xh[1] = vec_sr(v_xh[1], 2);
|
|
8067
|
+
|
|
8068
|
+
q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
|
|
8069
|
+
q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
|
|
8070
|
+
q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
|
|
8071
|
+
q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);
|
|
8072
|
+
|
|
8073
|
+
int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
|
|
8074
|
+
int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
|
|
8075
|
+
|
|
8076
|
+
sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
|
|
8077
|
+
sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
|
|
8078
|
+
}
|
|
6791
8079
|
|
|
8080
|
+
sumf += d * sumi - dmin * mins;
|
|
8081
|
+
}
|
|
8082
|
+
|
|
8083
|
+
*s = sumf;
|
|
6792
8084
|
#else
|
|
6793
8085
|
|
|
6794
8086
|
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
|
@@ -6802,11 +8094,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6802
8094
|
|
|
6803
8095
|
float sumf = 0;
|
|
6804
8096
|
for (int i = 0; i < nb; ++i) {
|
|
6805
|
-
const uint8_t *
|
|
6806
|
-
const uint8_t *
|
|
6807
|
-
const int8_t *
|
|
8097
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
8098
|
+
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
|
8099
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6808
8100
|
memset(aux32, 0, 8*sizeof(int32_t));
|
|
6809
|
-
int8_t *
|
|
8101
|
+
int8_t * GGML_RESTRICT a = aux8;
|
|
6810
8102
|
uint8_t m = 1;
|
|
6811
8103
|
for (int j = 0; j < QK_K/64; ++j) {
|
|
6812
8104
|
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
@@ -6853,7 +8145,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6853
8145
|
#endif
|
|
6854
8146
|
}
|
|
6855
8147
|
|
|
6856
|
-
void ggml_vec_dot_q6_K_q8_K(int n, float *
|
|
8148
|
+
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
6857
8149
|
assert(n % QK_K == 0);
|
|
6858
8150
|
assert(nrc == 1);
|
|
6859
8151
|
UNUSED(nrc);
|
|
@@ -6861,8 +8153,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6861
8153
|
UNUSED(by);
|
|
6862
8154
|
UNUSED(bs);
|
|
6863
8155
|
|
|
6864
|
-
const block_q6_K *
|
|
6865
|
-
const block_q8_K *
|
|
8156
|
+
const block_q6_K * GGML_RESTRICT x = vx;
|
|
8157
|
+
const block_q8_K * GGML_RESTRICT y = vy;
|
|
6866
8158
|
|
|
6867
8159
|
const int nb = n / QK_K;
|
|
6868
8160
|
|
|
@@ -6882,11 +8174,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6882
8174
|
|
|
6883
8175
|
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
|
6884
8176
|
|
|
6885
|
-
const uint8_t *
|
|
6886
|
-
const uint8_t *
|
|
6887
|
-
const int8_t *
|
|
8177
|
+
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
|
|
8178
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
8179
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6888
8180
|
|
|
6889
|
-
const int8_t *
|
|
8181
|
+
const int8_t * GGML_RESTRICT scale = x[i].scales;
|
|
6890
8182
|
|
|
6891
8183
|
const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
|
|
6892
8184
|
const int8x16_t scales = vld1q_s8(scale);
|
|
@@ -6973,9 +8265,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
6973
8265
|
|
|
6974
8266
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
6975
8267
|
|
|
6976
|
-
const uint8_t *
|
|
6977
|
-
const uint8_t *
|
|
6978
|
-
const int8_t *
|
|
8268
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
8269
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
8270
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
6979
8271
|
|
|
6980
8272
|
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
|
|
6981
8273
|
|
|
@@ -7051,9 +8343,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
7051
8343
|
|
|
7052
8344
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
7053
8345
|
|
|
7054
|
-
const uint8_t *
|
|
7055
|
-
const uint8_t *
|
|
7056
|
-
const int8_t *
|
|
8346
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
8347
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
8348
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
7057
8349
|
|
|
7058
8350
|
// handle the q6_k -32 offset separately using bsums
|
|
7059
8351
|
const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
|
|
@@ -7145,6 +8437,85 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
7145
8437
|
|
|
7146
8438
|
*s = hsum_float_8(acc);
|
|
7147
8439
|
|
|
8440
|
+
#elif defined __wasm_simd128__
|
|
8441
|
+
int8_t aux8[QK_K] __attribute__((aligned(16)));
|
|
8442
|
+
int32_t aux32[8] __attribute__((aligned(16))) = {0};
|
|
8443
|
+
float sums[8] __attribute__((aligned(16))) = {0};
|
|
8444
|
+
|
|
8445
|
+
for (int i = 0; i < nb; ++i) {
|
|
8446
|
+
// Unpack 6-bit quantized data into aux8 (unchanged)
|
|
8447
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
8448
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
8449
|
+
int8_t * a = aux8;
|
|
8450
|
+
for (int j = 0; j < QK_K; j += 128) {
|
|
8451
|
+
for (int l = 0; l < 32; ++l) {
|
|
8452
|
+
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
|
8453
|
+
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
|
8454
|
+
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
|
8455
|
+
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
|
8456
|
+
}
|
|
8457
|
+
a += 128;
|
|
8458
|
+
q4 += 64;
|
|
8459
|
+
qh += 32;
|
|
8460
|
+
}
|
|
8461
|
+
|
|
8462
|
+
const int8_t * GGML_RESTRICT a_ptr = aux8;
|
|
8463
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
8464
|
+
v128_t acc0 = wasm_i32x4_splat(0);
|
|
8465
|
+
v128_t acc1 = wasm_i32x4_splat(0);
|
|
8466
|
+
|
|
8467
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
|
8468
|
+
const int scale = x[i].scales[j];
|
|
8469
|
+
const v128_t vscale = wasm_i32x4_splat(scale);
|
|
8470
|
+
|
|
8471
|
+
// Load 16 elements from a and q8
|
|
8472
|
+
const v128_t a_vec = wasm_v128_load(a_ptr);
|
|
8473
|
+
const v128_t q8_vec = wasm_v128_load(q8);
|
|
8474
|
+
|
|
8475
|
+
// Process low 8 elements
|
|
8476
|
+
v128_t a_low = wasm_i16x8_extend_low_i8x16(a_vec);
|
|
8477
|
+
v128_t q8_low = wasm_i16x8_extend_low_i8x16(q8_vec);
|
|
8478
|
+
v128_t prod_low = wasm_i16x8_mul(a_low, q8_low);
|
|
8479
|
+
v128_t prod_lo_lo = wasm_i32x4_extend_low_i16x8(prod_low);
|
|
8480
|
+
v128_t prod_lo_hi = wasm_i32x4_extend_high_i16x8(prod_low);
|
|
8481
|
+
|
|
8482
|
+
// Process high 8 elements
|
|
8483
|
+
v128_t a_high = wasm_i16x8_extend_high_i8x16(a_vec);
|
|
8484
|
+
v128_t q8_high = wasm_i16x8_extend_high_i8x16(q8_vec);
|
|
8485
|
+
v128_t prod_high = wasm_i16x8_mul(a_high, q8_high);
|
|
8486
|
+
v128_t prod_hi_lo = wasm_i32x4_extend_low_i16x8(prod_high);
|
|
8487
|
+
v128_t prod_hi_hi = wasm_i32x4_extend_high_i16x8(prod_high);
|
|
8488
|
+
|
|
8489
|
+
// Scale and accumulate
|
|
8490
|
+
prod_lo_lo = wasm_i32x4_mul(prod_lo_lo, vscale);
|
|
8491
|
+
prod_lo_hi = wasm_i32x4_mul(prod_lo_hi, vscale);
|
|
8492
|
+
prod_hi_lo = wasm_i32x4_mul(prod_hi_lo, vscale);
|
|
8493
|
+
prod_hi_hi = wasm_i32x4_mul(prod_hi_hi, vscale);
|
|
8494
|
+
|
|
8495
|
+
acc0 = wasm_i32x4_add(acc0, wasm_i32x4_add(prod_lo_lo, prod_hi_lo));
|
|
8496
|
+
acc1 = wasm_i32x4_add(acc1, wasm_i32x4_add(prod_lo_hi, prod_hi_hi));
|
|
8497
|
+
|
|
8498
|
+
a_ptr += 16;
|
|
8499
|
+
q8 += 16;
|
|
8500
|
+
}
|
|
8501
|
+
|
|
8502
|
+
// Store accumulated results
|
|
8503
|
+
wasm_v128_store(&aux32[0], acc0);
|
|
8504
|
+
wasm_v128_store(&aux32[4], acc1);
|
|
8505
|
+
|
|
8506
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
8507
|
+
for (int l = 0; l < 8; ++l) {
|
|
8508
|
+
sums[l] += d * aux32[l];
|
|
8509
|
+
}
|
|
8510
|
+
}
|
|
8511
|
+
|
|
8512
|
+
// Sum final results
|
|
8513
|
+
float sumf = 0;
|
|
8514
|
+
for (int l = 0; l < 8; ++l) {
|
|
8515
|
+
sumf += sums[l];
|
|
8516
|
+
}
|
|
8517
|
+
*s = sumf;
|
|
8518
|
+
|
|
7148
8519
|
#elif defined __riscv_v_intrinsic
|
|
7149
8520
|
|
|
7150
8521
|
float sumf = 0;
|
|
@@ -7152,11 +8523,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
7152
8523
|
|
|
7153
8524
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
7154
8525
|
|
|
7155
|
-
const uint8_t *
|
|
7156
|
-
const uint8_t *
|
|
7157
|
-
const int8_t *
|
|
8526
|
+
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
|
|
8527
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
8528
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
7158
8529
|
|
|
7159
|
-
const int8_t *
|
|
8530
|
+
const int8_t * GGML_RESTRICT scale = x[i].scales;
|
|
7160
8531
|
|
|
7161
8532
|
size_t vl;
|
|
7162
8533
|
|
|
@@ -7258,10 +8629,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
7258
8629
|
vector signed int vsumi6 = v0;
|
|
7259
8630
|
vector signed int vsumi7 = v0;
|
|
7260
8631
|
|
|
7261
|
-
const uint8_t *
|
|
7262
|
-
const uint8_t *
|
|
7263
|
-
const int8_t *
|
|
7264
|
-
const int8_t *
|
|
8632
|
+
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
|
|
8633
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
8634
|
+
const int8_t * GGML_RESTRICT qs = x[i].scales;
|
|
8635
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
7265
8636
|
|
|
7266
8637
|
for (int j = 0; j < QK_K/128; ++j) {
|
|
7267
8638
|
__builtin_prefetch(q6, 0, 0);
|
|
@@ -7369,8 +8740,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
7369
8740
|
|
|
7370
8741
|
#elif defined __loongarch_asx
|
|
7371
8742
|
|
|
7372
|
-
const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
|
|
7373
|
-
const __m256i m2 = __lasx_xvreplgr2vr_b(3);
|
|
7374
8743
|
const __m256i m32s = __lasx_xvreplgr2vr_b(32);
|
|
7375
8744
|
|
|
7376
8745
|
__m256 acc = (__m256)__lasx_xvldi(0);
|
|
@@ -7379,62 +8748,46 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
7379
8748
|
|
|
7380
8749
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
7381
8750
|
|
|
7382
|
-
const uint8_t *
|
|
7383
|
-
const uint8_t *
|
|
7384
|
-
const int8_t *
|
|
8751
|
+
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
8752
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
8753
|
+
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
7385
8754
|
|
|
7386
|
-
const __m128i
|
|
8755
|
+
const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
|
|
8756
|
+
const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
|
|
8757
|
+
const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask));
|
|
7387
8758
|
|
|
7388
8759
|
__m256i sumi = __lasx_xvldi(0);
|
|
7389
8760
|
|
|
7390
|
-
int is = 0;
|
|
7391
|
-
|
|
7392
8761
|
for (int j = 0; j < QK_K/128; ++j) {
|
|
7393
8762
|
|
|
7394
|
-
const __m128i scale_0 = lsx_shuffle_b(scales, get_scale_shuffle(is + 0));
|
|
7395
|
-
const __m128i scale_1 = lsx_shuffle_b(scales, get_scale_shuffle(is + 1));
|
|
7396
|
-
const __m128i scale_2 = lsx_shuffle_b(scales, get_scale_shuffle(is + 2));
|
|
7397
|
-
const __m128i scale_3 = lsx_shuffle_b(scales, get_scale_shuffle(is + 3));
|
|
7398
|
-
is += 4;
|
|
7399
|
-
|
|
7400
8763
|
const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
|
|
7401
8764
|
const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
|
|
7402
8765
|
const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32;
|
|
7403
8766
|
|
|
7404
|
-
const __m256i q4h_0 =
|
|
7405
|
-
const __m256i q4h_1 =
|
|
7406
|
-
const __m256i q4h_2 =
|
|
7407
|
-
const __m256i q4h_3 =
|
|
8767
|
+
const __m256i q4h_0 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3), 4);
|
|
8768
|
+
const __m256i q4h_1 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3 << 2), 2);
|
|
8769
|
+
const __m256i q4h_2 = __lasx_xvandi_b(q4bitsH, 3 << 4);
|
|
8770
|
+
const __m256i q4h_3 = __lasx_xvsrli_b(__lasx_xvandi_b(q4bitsH, 3 << 6), 2);
|
|
7408
8771
|
|
|
7409
|
-
const __m256i q4_0 = __lasx_xvor_v(
|
|
7410
|
-
const __m256i q4_1 = __lasx_xvor_v(
|
|
7411
|
-
const __m256i q4_2 = __lasx_xvor_v(
|
|
7412
|
-
const __m256i q4_3 = __lasx_xvor_v(
|
|
8772
|
+
const __m256i q4_0 = __lasx_xvor_v(__lasx_xvandi_b(q4bits1, 0xf), q4h_0);
|
|
8773
|
+
const __m256i q4_1 = __lasx_xvor_v(__lasx_xvandi_b(q4bits2, 0xf), q4h_1);
|
|
8774
|
+
const __m256i q4_2 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits1, 4), q4h_2);
|
|
8775
|
+
const __m256i q4_3 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits2, 4), q4h_3);
|
|
7413
8776
|
|
|
7414
8777
|
const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
7415
8778
|
const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
7416
8779
|
const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
7417
8780
|
const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
|
|
7418
8781
|
|
|
7419
|
-
__m256i
|
|
7420
|
-
__m256i
|
|
7421
|
-
__m256i
|
|
7422
|
-
__m256i
|
|
7423
|
-
|
|
7424
|
-
__m256i p16_0 = lasx_maddubs_h(q4_0, q8_0);
|
|
7425
|
-
__m256i p16_1 = lasx_maddubs_h(q4_1, q8_1);
|
|
7426
|
-
__m256i p16_2 = lasx_maddubs_h(q4_2, q8_2);
|
|
7427
|
-
__m256i p16_3 = lasx_maddubs_h(q4_3, q8_3);
|
|
8782
|
+
__m256i p16_0 = lasx_madd_h_b(__lasx_xvsub_b(q4_0, m32s), q8_0);
|
|
8783
|
+
__m256i p16_1 = lasx_madd_h_b(__lasx_xvsub_b(q4_1, m32s), q8_1);
|
|
8784
|
+
__m256i p16_2 = lasx_madd_h_b(__lasx_xvsub_b(q4_2, m32s), q8_2);
|
|
8785
|
+
__m256i p16_3 = lasx_madd_h_b(__lasx_xvsub_b(q4_3, m32s), q8_3);
|
|
7428
8786
|
|
|
7429
|
-
p16_0 =
|
|
7430
|
-
p16_1 =
|
|
7431
|
-
p16_2 =
|
|
7432
|
-
p16_3 =
|
|
7433
|
-
|
|
7434
|
-
p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0);
|
|
7435
|
-
p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1);
|
|
7436
|
-
p16_2 = lasx_madd_h(lasx_ext8_16(scale_2), p16_2);
|
|
7437
|
-
p16_3 = lasx_madd_h(lasx_ext8_16(scale_3), p16_3);
|
|
8787
|
+
p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0);
|
|
8788
|
+
p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1);
|
|
8789
|
+
p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2);
|
|
8790
|
+
p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3);
|
|
7438
8791
|
|
|
7439
8792
|
sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
|
|
7440
8793
|
sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3));
|
|
@@ -7444,7 +8797,130 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
|
7444
8797
|
}
|
|
7445
8798
|
|
|
7446
8799
|
*s = hsum_float_8(acc);
|
|
8800
|
+
#elif defined(__VXE__) || defined(__VXE2__)
|
|
8801
|
+
float sum = 0;
|
|
8802
|
+
|
|
8803
|
+
// Lower 4-bit and upper 2-bit masks
|
|
8804
|
+
const uint8x16_t v_lm = vec_splat_u8(0x0F);
|
|
8805
|
+
const uint8x16_t v_um = vec_splat_u8(0x03);
|
|
8806
|
+
|
|
8807
|
+
const int32x4_t v_z = vec_splat_s32(0);
|
|
8808
|
+
|
|
8809
|
+
int8x16_t q6b[4];
|
|
8810
|
+
uint8x16_t q6h[4];
|
|
8811
|
+
|
|
8812
|
+
uint8x16_t v_xl[4];
|
|
8813
|
+
uint8x16_t v_xh[2];
|
|
8814
|
+
int8x16_t v_y[4];
|
|
8815
|
+
|
|
8816
|
+
for (int i = 0; i < nb; ++i) {
|
|
8817
|
+
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
|
8818
|
+
|
|
8819
|
+
const uint8_t * GGML_RESTRICT x0l = x[i].ql;
|
|
8820
|
+
const uint8_t * GGML_RESTRICT x0h = x[i].qh;
|
|
8821
|
+
const int8_t * GGML_RESTRICT y0 = y[i].qs;
|
|
8822
|
+
|
|
8823
|
+
const int8_t * GGML_RESTRICT scale = x[i].scales;
|
|
8824
|
+
|
|
8825
|
+
const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
|
|
8826
|
+
const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
|
|
8827
|
+
|
|
8828
|
+
const int8x16_t v_scale = vec_xl(0, scale);
|
|
8829
|
+
const int16x8_t v_scalel = vec_unpackh(v_scale);
|
|
8830
|
+
const int16x8_t v_scaleh = vec_unpackl(v_scale);
|
|
8831
|
+
|
|
8832
|
+
const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
|
|
8833
|
+            const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
+            const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
+            const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
+            const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;

+            const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+
+            int32_t isum = 0;
+            for (int j = 0; j < QK_K/128; ++j) {
+                // Load model upper 2 bits
+                v_xh[0] = vec_xl(0 , x0h);
+                v_xh[1] = vec_xl(16, x0h);
+                x0h += 32;
+
+                // Load model lower 4 bits
+                v_xl[0] = vec_xl(0 , x0l);
+                v_xl[1] = vec_xl(16, x0l);
+                v_xl[2] = vec_xl(32, x0l);
+                v_xl[3] = vec_xl(48, x0l);
+                x0l += 64;
+
+                // Load activation quants
+                v_y[0] = vec_xl(0 , y0);
+                v_y[1] = vec_xl(16, y0);
+                v_y[2] = vec_xl(32, y0);
+                v_y[3] = vec_xl(48, y0);
+                y0 += 64;
+
+                q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4);
+                q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4);
+                uint8x16_t shifted = vec_sr(v_xh[0], 2);
+                q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
+                shifted = vec_sr(v_xh[1], 2);
+                q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
+
+                q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0]));
+                q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1]));
+                q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2]));
+                q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3]));
+
+                int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
+                int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
+                int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
+                int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
+
+                isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
+                        (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
+                        (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
+                        (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+
+                scale += 4;
+
+
+                // Load activation quants
+                v_y[0] = vec_xl(0 , y0);
+                v_y[1] = vec_xl(16, y0);
+                v_y[2] = vec_xl(32, y0);
+                v_y[3] = vec_xl(48, y0);
+                y0 += 64;
+
+                shifted = vec_sr(v_xh[0], 4);
+                q6h[0] = vec_sl(vec_and(v_um, shifted), 4);
+                shifted = vec_sr(v_xh[1], 4);
+                q6h[1] = vec_sl(vec_and(v_um, shifted), 4);
+                shifted = vec_sr(v_xh[0], 6);
+                q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
+                shifted = vec_sr(v_xh[1], 6);
+                q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
+
+                q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0]));
+                q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1]));
+                q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2]));
+                q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3]));
+
+                summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
+                summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
+                summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
+                summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
+
+                isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
+                        (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
+                        (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
+                        (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+
+                scale += 4;
+            }
+
+            sum += d_all * y[i].d * (isum - 32 * mins);
+        }
+
+        *s = sum;
 #else

     int8_t aux8[QK_K];
@@ -7455,11 +8931,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t *
+        int8_t * GGML_RESTRICT a = aux8;
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; ++l) {
                 a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
@@ -7527,7 +9003,7 @@ static const int8_t keven_signs_q2xs[1024] = {
 };
 #endif

-void ggml_vec_dot_iq2_xxs_q8_K(int n, float *
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -7535,8 +9011,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     UNUSED(by);
     UNUSED(bs);

-    const block_iq2_xxs *
-    const block_q8_K *
+    const block_iq2_xxs * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;

     const int nb = n / QK_K;

@@ -7554,8 +9030,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         float sumf1 = 0, sumf2 = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -7591,8 +9067,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -7632,8 +9108,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m128i sumi1_0 = _mm_setzero_si128();
         __m128i sumi1_1 = _mm_setzero_si128();
         __m128i sumi2_0 = _mm_setzero_si128();
@@ -7697,8 +9173,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;

-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         for (int j = 0; j < QK_K/32; j += 2) {
             __builtin_prefetch(q2, 0, 1);
@@ -7774,8 +9250,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = __lasx_xvldi(0);
         __m256i sumi2 = __lasx_xvldi(0);
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -7805,7 +9281,57 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     }

     *s = 0.125f * hsum_float_8(accumf);
-
+//#elif defined(__VXE__) || defined(__VXE2__)
+//    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+//
+//    uint32_t aux32[4];
+//    const uint8_t * aux8 = (const uint8_t *)aux32;
+//
+//    float sumf = 0;
+//
+//    for (int i = 0; i < nb; ++i) {
+//        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+//        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+//        const int8_t * GGML_RESTRICT q8 = y[i].qs;
+//
+//        float sumf1 = 0, sumf2 = 0;
+//
+//        for (int ib32 = 0; ib32 < QK_K/32; ib += 2) {
+//            int8x16_t q8b0 = vec_xl( 0, q8);
+//            int8x16_t qb81 = vec_xl(16, q8);
+//            int8x16_t q8b2 = vec_xl(32, q8);
+//            int8x16_t q8b3 = vec_xl(48, q8);
+//            q8 += 64;
+//
+//            memcpy(aux32, q2, 4 * sizeof(uint32_t));
+//            q2 += 8;
+//
+//            int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) };
+//            int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) };
+//            int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) };
+//            int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) };
+//
+//            int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) };
+//            int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) };
+//            int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) };
+//            int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) };
+//
+//            q2u0 = vec_mul(q2u0, q2s0);
+//            q2u1 = vec_mul(q2u1, q2s1);
+//            q2u2 = vec_mul(q2u2, q2s2);
+//            q2u3 = vec_mul(q2u3, q2s3);
+//
+//            const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1);
+//            const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3);
+//
+//            sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28));
+//            sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28));
+//        }
+//
+//        sumf += d * (sumf1 + sumf2);
+//    }
+//
+//    *s = 0.25f * sumf;
 #else

     uint32_t aux32[2];
@@ -7814,8 +9340,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
             memcpy(aux32, q2, 2*sizeof(uint32_t));
@@ -7838,7 +9364,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 #endif
 }

-void ggml_vec_dot_iq2_xs_q8_K(int n, float *
+void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -7846,8 +9372,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     UNUSED(by);
     UNUSED(bs);

-    const block_iq2_xs *
-    const block_q8_K *
+    const block_iq2_xs * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;

     const int nb = n / QK_K;

@@ -7864,8 +9390,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         const uint8x8_t scales8 = vld1_u8(x[i].scales);
         const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
         const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
@@ -7942,8 +9468,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         memcpy(&aux64, x[i].scales, 8);
         __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8063,8 +9589,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         memcpy(&aux64, x[i].scales, 8);
         __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8218,8 +9744,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         memcpy(&aux64, x[i].scales, 8);
         __m128i stmp = __lsx_vreplgr2vr_d(aux64);
@@ -8316,9 +9842,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;

-        const uint16_t *
-        const uint8_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const uint8_t * GGML_RESTRICT sc = x[i].scales;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         for (int j = 0; j < QK_K/64; ++j) {
             __builtin_prefetch(q2, 0, 1);
@@ -8388,9 +9914,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t *
-        const uint8_t *
-        const int8_t *
+        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+        const uint8_t * GGML_RESTRICT sc = x[i].scales;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
             const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
@@ -8423,7 +9949,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 #endif
 }

-void ggml_vec_dot_iq2_s_q8_K(int n, float *
+void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -8431,8 +9957,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
     UNUSED(by);
     UNUSED(bs);

-    const block_iq2_s *
-    const block_q8_K *
+    const block_iq2_s * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;

     const int nb = n / QK_K;

@@ -8458,10 +9984,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *

         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;

-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         int sumi1 = 0, sumi2 = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -8532,10 +10058,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         memcpy(&aux64, x[i].scales, 8);
         const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -8605,10 +10131,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         memcpy(&aux64, x[i].scales, 8);
         const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -8703,11 +10229,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;

-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const uint8_t * GGML_RESTRICT sc = x[i].scales;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         for (int j = 0; j < QK_K/32; j += 2) {
             __builtin_prefetch(q2, 0, 1);
@@ -8804,10 +10330,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         __m128i tmp1;
         memcpy(&aux64, x[i].scales, 8);
@@ -8901,7 +10427,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *

 }

-void ggml_vec_dot_iq3_xxs_q8_K(int n, float *
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -8909,8 +10435,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     UNUSED(by);
     UNUSED(bs);

-    const block_iq3_xxs *
-    const block_q8_K *
+    const block_iq3_xxs * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;

     const int nb = n / QK_K;

@@ -8926,9 +10452,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         float sumf1 = 0, sumf2 = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -8964,9 +10490,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9009,9 +10535,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m128i sumi1_0 = _mm_setzero_si128();
         __m128i sumi1_1 = _mm_setzero_si128();
         __m128i sumi2_0 = _mm_setzero_si128();
@@ -9078,9 +10604,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
         vector signed int vsumi2 = v0;
         vector signed int vsumi3 = v0;

-        const uint8_t *
-        const uint32_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4);
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

 #pragma GCC unroll 1
         for (int j = 0; j < QK_K/32; j += 2) {
@@ -9152,9 +10678,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = __lasx_xvldi(0);
         __m256i sumi2 = __lasx_xvldi(0);
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9197,9 +10723,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
             memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
@@ -9224,7 +10750,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 #endif
 }

-void ggml_vec_dot_iq3_s_q8_K (int n, float *
+void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -9232,8 +10758,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     UNUSED(by);
     UNUSED(bs);

-    const block_iq3_s *
-    const block_q8_K *
+    const block_iq3_s * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;

     const int nb = n / QK_K;

@@ -9270,10 +10796,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         memcpy(scales32, x[i].scales, 4);
         scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
@@ -9352,10 +10878,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9437,10 +10963,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m128i sumi1_0 = _mm_setzero_si128();
         __m128i sumi1_1 = _mm_setzero_si128();
         __m128i sumi2_0 = _mm_setzero_si128();
@@ -9538,11 +11064,11 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs);
+        const uint8_t * GGML_RESTRICT sc = x[i].scales;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;

         vector signed int vsumi0 = v0;
         vector signed int vsumi1 = v0;
@@ -9649,10 +11175,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const uint16_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = __lasx_xvldi(0);
         __m256i sumi2 = __lasx_xvldi(0);
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9710,10 +11236,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t *
-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint8_t * GGML_RESTRICT signs = x[i].signs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
@@ -9759,17 +11285,13 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
 }
 #elif defined(__loongarch_asx)
 static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
-    const __m256i
-    const __m256i
-
-    tmp1 = __lasx_xvmulwev_h_bu_b(ax, sy);
-    tmp2 = __lasx_xvmulwod_h_bu_b(ax, sy);
-    tmp3 = __lasx_xvadd_h(tmp1, tmp2);
-    return __lasx_xvsat_h(tmp3, 15);
+    const __m256i a = __lasx_xvmulwev_h_b(x, y);
+    const __m256i b = __lasx_xvmulwod_h_b(x, y);
+    return __lasx_xvadd_h(a, b);
 }
 #endif

-void ggml_vec_dot_iq1_s_q8_K  (int n, float *
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -9777,8 +11299,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
     UNUSED(by);
     UNUSED(bs);

-    const block_iq1_s *
-    const block_q8_K *
+    const block_iq1_s * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;

     const int nb = n / QK_K;

@@ -9840,10 +11362,19 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
         __m256i sumi = _mm256_setzero_si256();
         int sumi1 = 0;
         for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+            const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
+            const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
+            const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+            const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+#else
             const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
                                                     iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
             const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
                                                     iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+#endif
             qs += 8;
             const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
             const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
@@ -9936,10 +11467,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
         vector signed int vsumi3 = vec_splats((int32_t)0);
         vector signed int vsumi8 = vec_splats((int32_t)0);

-        const uint8_t *
-        const uint16_t *
-        const int8_t *
-        const int16_t *
+        const uint8_t * GGML_RESTRICT q1 = x[i].qs;
+        const uint16_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
+        const int16_t * GGML_RESTRICT qs = y[i].bsums;

         for (int j = 0; j < QK_K/32; j += 2) {
             __builtin_prefetch(q1, 0, 1);
@@ -10100,7 +11631,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
 #endif
 }

-void ggml_vec_dot_iq1_m_q8_K (int n, float *
+void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
     UNUSED(nrc);
@@ -10108,8 +11639,8 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
     UNUSED(by);
     UNUSED(bs);

-    const block_iq1_m *
-    const block_q8_K *
+    const block_iq1_m * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;

     const int nb = n / QK_K;

@@ -10189,6 +11720,10 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void

     const __m256i mask = _mm256_set1_epi16(0x7);
     const __m256i mone = _mm256_set1_epi16(1);
+    const __m256i mone8 = _mm256_set1_epi8(1);
+    const __m256i mtwo8 = _mm256_set1_epi8(2);
+    // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half.
+    const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);

     __m256 accum1 = _mm256_setzero_ps();
     __m256 accum2 = _mm256_setzero_ps();
@@ -10200,10 +11735,33 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
         const uint16_t * sc = (const uint16_t *)x[i].scales;

         scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+        // Extract 3-bit scales (16 values)
+        __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc);
+        scales = _mm256_srlv_epi64(scales, scales_shift);
+        scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);
+
+        // Indices to repeat each scale 8 times.
+        __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
+        __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));

         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
         for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+            const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
+                                       | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
+            const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
+                                       | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
+            const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+            const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+
+            // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
+            const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
+            const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
+            const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
+#else
             const __m256i q1b_1 = _mm256_set_epi64x(
                     iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
                     iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
@@ -10212,11 +11770,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
                     iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
                     iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
             );
-            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
-            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);

             const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                      qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
@@ -10226,15 +11779,21 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
                                                      qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
                                                      qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                      qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+#endif
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;

-            const __m256i
-            const __m256i
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+            const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
+            const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));

-            __m256i scale1 =
-            __m256i scale2 =
+            __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
+            __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);
+
+            scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
+            scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);

-            scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
-            scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
             const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
             const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
             const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
@@ -10390,7 +11949,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
 #endif
 }

-void ggml_vec_dot_iq4_nl_q8_0(int n, float *
+void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -10399,8 +11958,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     assert(n % QK4_NL == 0);
     static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

-    const block_iq4_nl *
-    const block_q8_0 *
+    const block_iq4_nl * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;

     const int nb = n / QK4_NL;

@@ -10570,6 +12129,27 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *

     sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));

+#elif defined(__VXE__) || defined(__VXE2__)
+    const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
+    const uint8x16_t v_m = vec_splat_u8(0x0F);
+
+    for (; ib < nb; ++ib) {
+        const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+
+        const uint8x16_t v_x = vec_xl(0, x0->qs);
+        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
+        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
+
+        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
+        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
+
+        const int8x16_t v_yl = vec_xl(0 , y0->qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
+        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+
+        sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
+    }
 #endif
     for (; ib < nb; ++ib) {
         const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
@@ -10583,7 +12163,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     *s = sumf;
 }

-void ggml_vec_dot_iq4_xs_q8_K(int n, float *
+void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -10591,8 +12171,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     UNUSED(bs);
     assert(n % QK_K == 0);

-    const block_iq4_xs *
-    const block_q8_K *
+    const block_iq4_xs * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;

     const int nb = n / QK_K;

@@ -10749,9 +12329,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *

         uint16_t h = x[ibl].scales_h;

-        const uint8_t *
-        const uint8_t *
-        const int8_t *
+        const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
+        const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l;
+        const int8_t * GGML_RESTRICT q8 = y[ibl].qs;

         for (int ib = 0; ib < QK_K/64; ib ++ ) {
             __builtin_prefetch(q4, 0, 1);
@@ -10815,67 +12395,31 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 #elif defined(__loongarch_asx)

     const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
-    const __m128i m4b = __lsx_vreplgr2vr_b(0x0f);

     __m256 accum = (__m256)__lasx_xvldi(0);
-    __m256i tmp1;
-    __m128i tmp0, tmp2, tmp3, tmp4, mask_8f, mask;

-    mask_8f = __lsx_vreplgr2vr_b(0x8f);
     for (int ibl = 0; ibl < nb; ++ibl) {
         const uint8_t * qs = x[ibl].qs;
         const int8_t * q8 = y[ibl].qs;
         uint16_t sh = x[ibl].scales_h;
         __m256i sumi1 = __lasx_xvldi(0);
         __m256i sumi2 = __lasx_xvldi(0);
-        __m128i zero = __lsx_vldi(0);
         for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0);
-            const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0);
+            const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
+            const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
             const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
             const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
-
-
-
-
-            tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
-
-            tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_1, m4b), mask_8f);
-            tmp0 = __lsx_vori_b(tmp2, 0x10);
-            mask = __lsx_vsle_b(zero, tmp2);
-            tmp4 = __lsx_vand_v(tmp0, mask);
-            tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
-
-            const __m256i q4b_1 = lasx_insertf128(tmp3, tmp4);
-
-            tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b), mask_8f);
-            tmp0 = __lsx_vori_b(tmp2, 0x10);
-            mask = __lsx_vsle_b(zero, tmp2);
-            tmp3 = __lsx_vand_v(tmp0, mask);
-            tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
-
-            tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_2, m4b), mask_8f);
-            tmp0 = __lsx_vori_b(tmp2, 0x10);
-            mask = __lsx_vsle_b(zero, tmp2);
-            tmp4 = __lsx_vand_v(tmp0, mask);
-            tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
-
-            const __m256i q4b_2 = lasx_insertf128(tmp3, tmp4);
-
+            const __m256i q4b_1 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_1, 4)),
+                                                  __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_1, 0xf)));
+            const __m256i q4b_2 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_2, 4)),
+                                                  __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_2, 0xf)));
             const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
             const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
             const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
             const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
             sh >>= 4;
-            __m256i
-
-            tmp5 = __lasx_xvmulwev_w_h(p16_1, tmp1);
-            tmp6 = __lasx_xvmulwod_w_h(p16_1, tmp1);
-            const __m256i p_1 = __lasx_xvadd_w(tmp5, tmp6);
-            tmp1 = __lasx_xvreplgr2vr_h(ls2);
-            tmp5 = __lasx_xvmulwev_w_h(p16_2, tmp1);
-            tmp6 = __lasx_xvmulwod_w_h(p16_2, tmp1);
-            const __m256i p_2 = __lasx_xvadd_w(tmp5, tmp6);
+            const __m256i p_1 = lasx_madd_h(p16_1, __lasx_xvreplgr2vr_h(ls1));
+            const __m256i p_2 = lasx_madd_h(p16_2, __lasx_xvreplgr2vr_h(ls2));
             sumi1 = __lasx_xvadd_w(p_1, sumi1);
             sumi2 = __lasx_xvadd_w(p_2, sumi2);
         }
@@ -10884,6 +12428,56 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     }

     *s = hsum_float_8(accum);
+#elif defined(__VXE__) || defined(__VXE2__)
+    const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
+    const uint8x16_t v_m = vec_splat_u8(0x0F);
+
+    float sumf = 0;
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
+        const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
+
+        uint16_t h = x[ibl].scales_h;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/64; ++ib) {
+            const uint8x16_t v_x0 = vec_xl(0 , q4);
+            const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4);
+            q4 += 32;
+
+            int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
+            int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
+            int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
+            int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
+
+            v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
+            v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
+            v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
+            v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
+
+            const int8x16_t v_y0 = vec_xl( 0, q8);
+            const int8x16_t v_y1 = vec_xl(16, q8);
+            const int8x16_t v_y2 = vec_xl(32, q8);
+            const int8x16_t v_y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1);
+            int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);
+
+            int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
+            int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
+
+            h >>= 4;
+
+            sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
+            sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
+        }
+
+        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;

 #else
     float sumf = 0;
@@ -10922,12 +12516,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *

 // ============================ 4-bit non-linear quants

-void quantize_row_iq4_nl(const float *
+void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     assert(k % QK4_NL == 0);
     quantize_row_iq4_nl_ref(x, y, k);
 }

-void quantize_row_iq4_xs(const float *
+void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     assert(k % QK_K == 0);
     quantize_iq4_xs(x, y, 1, k, NULL);
 }