@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
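Note on the hunks below: most of the changes in the bundled ggml quantization source replace the bare C99 restrict qualifier with the GGML_RESTRICT macro (presumably so the same translation units also compile as C++ or under MSVC, where plain restrict is not available), and add new SIMD code paths for WebAssembly SIMD128 and the s390x vector extensions (VXE/VXE2). As a rough illustration only — not copied from this package — such a portability macro is commonly defined along these lines:

    /* illustrative sketch, not the package's actual definition */
    #if defined(_MSC_VER) || defined(__cplusplus)
    #    define GGML_RESTRICT __restrict
    #else
    #    define GGML_RESTRICT restrict
    #endif

    /* hypothetical signature, shown only to illustrate usage */
    void quantize_row_example(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);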
@@ -501,30 +501,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
  }
 
  static __m256i lasx_extu8_16(__m128i a) {
- __m128i zero = __lsx_vldi(0);
- __m128i vlo = __lsx_vilvl_b(zero, a);
- __m128i vhi = __lsx_vilvh_b(zero, a);
- return lasx_set_q(vhi, vlo);
+ return __lasx_vext2xv_hu_bu(____m256i(a));
  }
 
  static __m256i lasx_ext8_16(__m128i a) {
- __m128i sign = __lsx_vslti_b(a, 0);
- __m128i vlo = __lsx_vilvl_b(sign, a);
- __m128i vhi = __lsx_vilvh_b(sign, a);
- return lasx_set_q(vhi, vlo);
+ return __lasx_vext2xv_h_b(____m256i(a));
  }
 
  static __m256i lasx_ext16_32(__m128i a) {
- __m256i tmp1;
- tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
- tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
- tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
- tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
- tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
- tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
- tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
- tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
- return tmp1;
+ return __lasx_vext2xv_w_h(____m256i(a));
  }
 
  static __m128i lasx_extracti128( __m256i a, int pos) {
@@ -577,6 +562,41 @@ static __m256i lasx_packs_h(__m256i a, __m256i b) {
  return __lasx_xvpickev_b(tmp1, tmp);
  }
 
+ static inline __m256i lasx_madd_h_b(__m256i a, __m256i b) {
+ __m256i tmp1, tmp2;
+ tmp1 = __lasx_xvmulwev_h_b(a, b);
+ tmp2 = __lasx_xvmulwod_h_b(a, b);
+ return __lasx_xvadd_h(tmp1, tmp2);
+ }
+
+ static inline __m256i lasx_xvrepl128vei_h(__m256i a, const unsigned int b) {
+ switch (b) {
+ case 0: return __lasx_xvrepl128vei_h(a, 0);
+ case 1: return __lasx_xvrepl128vei_h(a, 1);
+ case 2: return __lasx_xvrepl128vei_h(a, 2);
+ case 3: return __lasx_xvrepl128vei_h(a, 3);
+ case 4: return __lasx_xvrepl128vei_h(a, 4);
+ case 5: return __lasx_xvrepl128vei_h(a, 5);
+ case 6: return __lasx_xvrepl128vei_h(a, 6);
+ case 7: return __lasx_xvrepl128vei_h(a, 7);
+ default: __builtin_unreachable();
+ }
+ }
+
+ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
+ switch (b) {
+ case 0: return __lasx_xvandi_b(a, 1 << 0);
+ case 1: return __lasx_xvandi_b(a, 1 << 1);
+ case 2: return __lasx_xvandi_b(a, 1 << 2);
+ case 3: return __lasx_xvandi_b(a, 1 << 3);
+ case 4: return __lasx_xvandi_b(a, 1 << 4);
+ case 5: return __lasx_xvandi_b(a, 1 << 5);
+ case 6: return __lasx_xvandi_b(a, 1 << 6);
+ case 7: return __lasx_xvandi_b(a, 1 << 7);
+ default: __builtin_unreachable();
+ }
+ }
+
  // multiply int8_t, add results pairwise twice
  static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
  // Get absolute values of x vectors
@@ -592,12 +612,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
  // horizontally add 8 floats
  static inline float hsum_float_8(const __m256 x) {
  __m128 res = lasx_extractf128(x, 1);
- ft_union tmp;
  res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
  res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
  res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
- tmp.i = __lsx_vpickve2gr_w(res, 0);
- return tmp.f;
+ return ((v4f32)res)[0];
  }
 
  // horizontally add 8 int32_t
@@ -673,13 +691,8 @@ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy)
 
  // multiply int8_t, add results pairwise twice and return as float vector
  static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-
- // Get absolute values of x vectors
- const __m256i ax = __lasx_xvsigncov_b(x, x);
- // Sign the values of the y vectors
- const __m256i sy = __lasx_xvsigncov_b(x, y);
-
- return mul_sum_us8_pairs_float(ax, sy);
+ const __m256i dot = lasx_madd_h_b(x, y);
+ return sum_i16_pairs_float(dot);
  }
 
  static inline __m128i packNibbles( __m256i bytes ) {
@@ -706,28 +719,28 @@ static inline __m128i packNibbles( __m256i bytes ) {
  }
  #endif //__loongarch_asx
 
- void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q4_0_ref(x, y, k);
  }
 
- void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q4_1_ref(x, y, k);
  }
 
- void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q5_0_ref(x, y, k);
  }
 
- void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q5_1_ref(x, y, k);
  }
 
- void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(QK8_0 == 32);
  assert(k % QK8_0 == 0);
  const int nb = k / QK8_0;
 
- block_q8_0 * restrict y = vy;
+ block_q8_0 * GGML_RESTRICT y = vy;
 
  #if defined(__ARM_NEON)
  for (int i = 0; i < nb; i++) {
@@ -759,7 +772,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
  }
  }
- #elif defined(__wasm_simd128__)
+ #elif defined __wasm_simd128__
  for (int i = 0; i < nb; i++) {
  v128_t srcv [8];
  v128_t asrcv[8];
@@ -939,7 +952,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 
  #elif defined(__loongarch_asx)
  for (int i = 0; i < nb; i++) {
- ft_union fi;
  __m256 v0 = (__m256)__lasx_xvld( x , 0);
  __m256 v1 = (__m256)__lasx_xvld( x , 32);
  __m256 v2 = (__m256)__lasx_xvld( x , 64);
@@ -957,8 +969,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
  __m128 tmp = max4;
  max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
- fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
- const float max_scalar = fi.f;
+ const float max_scalar = ((v4f32)max4)[0];
 
  // Quantize these floats
  const float d = max_scalar / 127.f;
@@ -1000,6 +1011,38 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
 
  }
+ #elif defined(__VXE__) || defined(__VXE2__)
+ for (int i = 0; i < nb; i++) {
+ __vector float srcv [8];
+ __vector float asrcv[8];
+ __vector float amaxv[8];
+
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+ vec_extract(amaxv[0], 1)),
+ MAX(vec_extract(amaxv[0], 2),
+ vec_extract(amaxv[0], 3)));
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f / d : 0.0f;
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ for (int j = 0; j < 8; j++) {
+ const __vector float v = vec_mul(srcv[j], vec_splats(id));
+ const __vector int32_t vi = vec_signed(v);
+
+ y[i].qs[4*j + 0] = vec_extract(vi, 0);
+ y[i].qs[4*j + 1] = vec_extract(vi, 1);
+ y[i].qs[4*j + 2] = vec_extract(vi, 2);
+ y[i].qs[4*j + 3] = vec_extract(vi, 3);
+ }
+ }
  #else
  GGML_UNUSED(nb);
  // scalar
@@ -1007,11 +1050,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  #endif
  }
 
- void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK8_1 == 0);
  const int nb = k / QK8_1;
 
- block_q8_1 * restrict y = vy;
+ block_q8_1 * GGML_RESTRICT y = vy;
 
  #if defined(__ARM_NEON)
  for (int i = 0; i < nb; i++) {
@@ -1049,7 +1092,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 
  y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
  }
- #elif defined(__wasm_simd128__)
+ #elif defined __wasm_simd128__
  for (int i = 0; i < nb; i++) {
  v128_t srcv [8];
  v128_t asrcv[8];
@@ -1263,7 +1306,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 
  #elif defined(__loongarch_asx)
  for (int i = 0; i < nb; i++) {
- ft_union ft;
  __m256 v0 = (__m256)__lasx_xvld( x , 0 );
  __m256 v1 = (__m256)__lasx_xvld( x , 32 );
  __m256 v2 = (__m256)__lasx_xvld( x , 64 );
@@ -1281,8 +1323,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
  max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
  __m128 tmp = max4;
  max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
- ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
- const float max_scalar = ft.f;
+ const float max_scalar = ((v4f32)max4)[0];
 
  // Quantize these floats
  const float d = max_scalar / 127.f;
@@ -1328,6 +1369,44 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
  __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0);
  __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
  }
+ #elif defined(__VXE__) || defined(__VXE2__)
+ for (int i = 0; i < nb; i++) {
+ __vector float srcv [8];
+ __vector float asrcv[8];
+ __vector float amaxv[8];
+
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+ vec_extract(amaxv[0], 1)),
+ MAX(vec_extract(amaxv[0], 2),
+ vec_extract(amaxv[0], 3)));
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f / d : 0.0f;
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ __vector int32_t acc = vec_splats(0);
+
+ for (int j = 0; j < 8; j++) {
+ const __vector float v = vec_mul(srcv[j], vec_splats(id));
+ const __vector int32_t vi = vec_signed(v);
+
+ y[i].qs[4*j + 0] = vec_extract(vi, 0);
+ y[i].qs[4*j + 1] = vec_extract(vi, 1);
+ y[i].qs[4*j + 2] = vec_extract(vi, 2);
+ y[i].qs[4*j + 3] = vec_extract(vi, 3);
+
+ acc = vec_add(acc, vi);
+ }
+
+ y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
+ }
  #else
  GGML_UNUSED(nb);
  // scalar
@@ -1349,8 +1428,8 @@ static inline int nearest_int(float fval) {
  return (i & 0x007fffff) - 0x00400000;
  }
 
- static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
- const float * restrict qw) {
+ static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
+ const float * GGML_RESTRICT qw) {
  float max = 0;
  float amax = 0;
  for (int i = 0; i < n; ++i) {
@@ -1418,7 +1497,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
  return scale;
  }
 
- static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
+ static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
  float max = 0;
  float amax = 0;
  for (int i = 0; i < n; ++i) {
@@ -1477,7 +1556,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
  return 1/iscale;
  }
 
- static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+ static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
  int ntry, float alpha) {
  float min = x[0];
  float max = x[0];
@@ -1520,8 +1599,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
  return scale;
  }
 
- static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
  float rmin, float rdelta, int nstep, bool use_mad) {
  float min = x[0];
  float max = x[0];
@@ -1601,7 +1680,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
  return scale;
  }
 
- static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
+ static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
  if (j < 4) {
  *d = q[j] & 63; *m = q[j + 4] & 63;
  } else {
@@ -1612,51 +1691,51 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
 
  //========================- 2-bit (de)-quantization
 
- void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  quantize_row_q2_K_ref(x, vy, k);
  }
 
  //========================= 3-bit (de)-quantization
 
- void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  quantize_row_q3_K_ref(x, vy, k);
  }
 
  // ====================== 4-bit (de)-quantization
 
- void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_q4_K * restrict y = vy;
+ block_q4_K * GGML_RESTRICT y = vy;
  quantize_row_q4_K_ref(x, y, k);
  }
 
  // ====================== 5-bit (de)-quantization
 
- void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_q5_K * restrict y = vy;
+ block_q5_K * GGML_RESTRICT y = vy;
  quantize_row_q5_K_ref(x, y, k);
  }
 
  // ====================== 6-bit (de)-quantization
 
- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_q6_K * restrict y = vy;
+ block_q6_K * GGML_RESTRICT y = vy;
  quantize_row_q6_K_ref(x, y, k);
  }
 
  // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
 
- void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_tq1_0 * restrict y = vy;
+ block_tq1_0 * GGML_RESTRICT y = vy;
  quantize_row_tq1_0_ref(x, y, k);
  }
 
- void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_tq2_0 * restrict y = vy;
+ block_tq2_0 * GGML_RESTRICT y = vy;
  quantize_row_tq2_0_ref(x, y, k);
  }
 
@@ -1664,8 +1743,88 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1
 
  //===================================== Q8_K ==============================================
 
- void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+ #ifdef __wasm_simd128__
+ assert(k % QK_K == 0);
+ const int64_t nb = k / QK_K;
+ block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type
+
+ for (int i = 0; i < nb; i++) {
+ const float * x_block = x + i * QK_K;
+
+ v128_t min_vec = wasm_v128_load(x_block);
+ v128_t max_vec = min_vec;
+
+ for (int j = 4; j < QK_K; j += 4) {
+ v128_t x_vec = wasm_v128_load(x_block + j);
+ max_vec = wasm_f32x4_pmax(max_vec, x_vec);
+ min_vec = wasm_f32x4_pmin(min_vec, x_vec);
+ }
+ max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1));
+ max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2));
+ min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1));
+ min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2));
+ float max = wasm_f32x4_extract_lane(max_vec, 0);
+ float min = wasm_f32x4_extract_lane(min_vec, 0);
+ float amax = -min > max ? min : max;
+
+ if (amax == 0.0f) {
+ yc[i].d = 0.0f;
+ const v128_t zero = wasm_i8x16_splat(0);
+ for (int j = 0; j < QK_K; j += 16) {
+ wasm_v128_store(yc[i].qs + j, zero);
+ }
+ continue;
+ }
+
+ const float iscale = -127.0f / amax;
+ const v128_t scale_vec = wasm_f32x4_splat(iscale);
+
+ // Process 16 elements per iteration
+ for (int j = 0, jb = 0; j < QK_K; j += 16, jb++) {
+ // Load and quantize 16 floats
+ v128_t x0 = wasm_v128_load(x_block + j);
+ v128_t x1 = wasm_v128_load(x_block + j + 4);
+ v128_t x2 = wasm_v128_load(x_block + j + 8);
+ v128_t x3 = wasm_v128_load(x_block + j + 12);
+
+ v128_t q0 = wasm_f32x4_nearest(wasm_f32x4_mul(x0, scale_vec));
+ v128_t q1 = wasm_f32x4_nearest(wasm_f32x4_mul(x1, scale_vec));
+ v128_t q2 = wasm_f32x4_nearest(wasm_f32x4_mul(x2, scale_vec));
+ v128_t q3 = wasm_f32x4_nearest(wasm_f32x4_mul(x3, scale_vec));
+
+ // Convert to i32 with saturation
+ v128_t i0 = wasm_i32x4_trunc_sat_f32x4(q0);
+ v128_t i1 = wasm_i32x4_trunc_sat_f32x4(q1);
+ v128_t i2 = wasm_i32x4_trunc_sat_f32x4(q2);
+ v128_t i3 = wasm_i32x4_trunc_sat_f32x4(q3);
+
+ // Pack into 16 i8 values
+ v128_t i8 = wasm_i8x16_narrow_i16x8(
+ wasm_i16x8_narrow_i32x4(i0, i1),
+ wasm_i16x8_narrow_i32x4(i2, i3)
+ );
+ wasm_v128_store(yc[i].qs + j, i8);
+
+ // Calculate bsums using SIMD
+ v128_t sum16 = wasm_i16x8_add(
+ wasm_i16x8_extend_low_i8x16(i8),
+ wasm_i16x8_extend_high_i8x16(i8)
+ );
+ v128_t sum32 = wasm_i32x4_add(
+ wasm_i32x4_extend_low_i16x8(sum16),
+ wasm_i32x4_extend_high_i16x8(sum16)
+ );
+ sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 2, 3, 0, 1));
+ sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 1, 0, 3, 2));
+ yc[i].bsums[jb] = wasm_i32x4_extract_lane(sum32, 0);
+ }
+
+ yc[i].d = 1.0f / iscale;
+ }
+ #else
  quantize_row_q8_K_ref(x, y, k);
+ #endif
  }
 
  //===================================== Dot products =================================
@@ -1750,7 +1909,7 @@ static inline __m128i get_scale_shuffle(int i) {
  }
  #endif
 
- void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;
 
@@ -1765,23 +1924,23 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);
 
- const block_q4_0 * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_q4_0 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
 
  #if defined(__ARM_FEATURE_MATMUL_INT8)
  if (nrc == 2) {
- const block_q4_0 * restrict vx0 = vx;
- const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
- const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+ const block_q4_0 * GGML_RESTRICT vx0 = vx;
+ const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
 
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
 
  for (int i = 0; i < nb; i++) {
- const block_q4_0 * restrict b_x0 = &vx0[i];
- const block_q4_0 * restrict b_x1 = &vx1[i];
- const block_q8_0 * restrict b_y0 = &vy0[i];
- const block_q8_0 * restrict b_y1 = &vy1[i];
+ const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i];
+ const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i];
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
 
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
  const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -1858,10 +2017,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
 
  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
  // load x
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -1904,10 +2063,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
 
  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
  // load x
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -1945,10 +2104,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t pl16 = svnot_b_z(ph32, ph16);
 
  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
  // load x
  const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
@@ -1985,10 +2144,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
  const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -2023,6 +2182,94 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  }
 
  sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+ #elif defined __wasm_simd128__
+ v128_t sumv = wasm_f32x4_splat(0.0f);
+
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
+ const v128_t s8b = wasm_i8x16_splat(0x8);
+
+ for (; ib + 1 < nb; ib += 2) {
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
+
+ // Load and process x0
+ v128_t v0_0 = wasm_v128_load(x0->qs);
+ v128_t v0_0l = wasm_v128_and(v0_0, m4b);
+ v128_t v0_0h = wasm_u8x16_shr(v0_0, 4);
+ v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b);
+ v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b);
+
+ // Load y0 vectors
+ v128_t y0_l = wasm_v128_load(y0->qs);
+ v128_t y0_h = wasm_v128_load(y0->qs + 16);
+
+ // Extend to i16x8 and compute dot products
+ v128_t dx0l = wasm_i16x8_extend_low_i8x16(v0_0ls);
+ v128_t dx0h = wasm_i16x8_extend_high_i8x16(v0_0ls);
+ v128_t dx0hl = wasm_i16x8_extend_low_i8x16(v0_0hs);
+ v128_t dx0hh = wasm_i16x8_extend_high_i8x16(v0_0hs);
+
+ v128_t dy0ll = wasm_i16x8_extend_low_i8x16(y0_l);
+ v128_t dy0lh = wasm_i16x8_extend_high_i8x16(y0_l);
+ v128_t dy0hl = wasm_i16x8_extend_low_i8x16(y0_h);
+ v128_t dy0hh = wasm_i16x8_extend_high_i8x16(y0_h);
+
+ v128_t dp0 = wasm_i32x4_add(
+ wasm_i32x4_add(
+ wasm_i32x4_dot_i16x8(dx0l, dy0ll),
+ wasm_i32x4_dot_i16x8(dx0h, dy0lh)
+ ),
+ wasm_i32x4_add(
+ wasm_i32x4_dot_i16x8(dx0hl, dy0hl),
+ wasm_i32x4_dot_i16x8(dx0hh, dy0hh)
+ )
+ );
+
+ // Load and process x1
+ v128_t v0_1 = wasm_v128_load(x1->qs);
+ v128_t v0_1l = wasm_v128_and(v0_1, m4b);
+ v128_t v0_1h = wasm_u8x16_shr(v0_1, 4);
+ v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b);
+ v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b);
+
+ // Load y1 vectors
+ v128_t y1_l = wasm_v128_load(y1->qs);
+ v128_t y1_h = wasm_v128_load(y1->qs + 16);
+
+ // Extend to i16x8 and compute dot products
+ v128_t dx1l = wasm_i16x8_extend_low_i8x16(v0_1ls);
+ v128_t dx1h = wasm_i16x8_extend_high_i8x16(v0_1ls);
+ v128_t dx1hl = wasm_i16x8_extend_low_i8x16(v0_1hs);
+ v128_t dx1hh = wasm_i16x8_extend_high_i8x16(v0_1hs);
+
+ v128_t dy1ll = wasm_i16x8_extend_low_i8x16(y1_l);
+ v128_t dy1lh = wasm_i16x8_extend_high_i8x16(y1_l);
+ v128_t dy1hl = wasm_i16x8_extend_low_i8x16(y1_h);
+ v128_t dy1hh = wasm_i16x8_extend_high_i8x16(y1_h);
+
+ v128_t dp1 = wasm_i32x4_add(
+ wasm_i32x4_add(
+ wasm_i32x4_dot_i16x8(dx1l, dy1ll),
+ wasm_i32x4_dot_i16x8(dx1h, dy1lh)
+ ),
+ wasm_i32x4_add(
+ wasm_i32x4_dot_i16x8(dx1hl, dy1hl),
+ wasm_i32x4_dot_i16x8(dx1hh, dy1hh)
+ )
+ );
+
+ // Accumulate results with scaling
+ float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
+ float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d);
+
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0)));
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1)));
+ }
+
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
  #elif defined(__AVX2__)
  // Initialize accumulator with zeros
  __m256 acc = _mm256_setzero_ps();
@@ -2311,6 +2558,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  }
 
  sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+ #elif defined(__VXE__) || defined(__VXE2__)
+ __vector float acc = vec_splats(0.0f);
+
+ const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
+ const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
+
+ for (; ib < nb; ++ib) {
+ const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
+ const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
+ const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+
+ const __vector int8_t v_xls = vec_sub(v_xl, v_s);
+ const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+
+ const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
+ const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+
+ const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
+ const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
+ const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
+ const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+
+ __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+
+ const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
+ const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+ acc = vec_madd(v_xy, v_d, acc);
+ }
+
+ sumf = acc[0] + acc[1] + acc[2] + acc[3];
  #endif
  for (; ib < nb; ++ib) {
  int sumi0 = 0;
@@ -2331,7 +2609,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
  }
 
- void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_1;
  const int nb = n / qk;
 
@@ -2346,24 +2624,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);
 
- const block_q4_1 * restrict x = vx;
- const block_q8_1 * restrict y = vy;
+ const block_q4_1 * GGML_RESTRICT x = vx;
+ const block_q8_1 * GGML_RESTRICT y = vy;
 
  #if defined(__ARM_FEATURE_MATMUL_INT8)
  if (nrc == 2) {
- const block_q4_1 * restrict vx0 = vx;
- const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
- const block_q8_1 * restrict vy0 = vy;
- const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
+ const block_q4_1 * GGML_RESTRICT vx0 = vx;
+ const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
+ const block_q8_1 * GGML_RESTRICT vy0 = vy;
+ const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
 
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t summs0 = vdupq_n_f32(0.0f);
 
  for (int i = 0; i < nb; i++) {
- const block_q4_1 * restrict b_x0 = &vx0[i];
- const block_q4_1 * restrict b_x1 = &vx1[i];
- const block_q8_1 * restrict b_y0 = &vy0[i];
- const block_q8_1 * restrict b_y1 = &vy1[i];
+ const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i];
+ const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i];
+ const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i];
+ const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];
 
  float32_t summs_t[4] = {
  GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
@@ -2437,10 +2715,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  float summs = 0;
 
  for (; ib + 1 < nb; ib += 2) {
- const block_q4_1 * restrict x0 = &x[ib + 0];
- const block_q4_1 * restrict x1 = &x[ib + 1];
- const block_q8_1 * restrict y0 = &y[ib + 0];
- const block_q8_1 * restrict y1 = &y[ib + 1];
+ const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
 
  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
 
@@ -2604,6 +2882,35 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  }
 
  sumf = hsum_float_8(acc) + summs;
+ #elif defined(__VXE__) || defined(__VXE2__)
+ float summs = 0;
+ float32x4_t acc = vec_splats(0.0f);
+
+ const uint8x16_t v_m = vec_splat_u8(0x0F);
+
+ #pragma GCC unroll 4
+ for (; ib < nb; ++ib) {
+ __builtin_prefetch(x[ib].qs, 0, 1);
+ __builtin_prefetch(y[ib].qs, 0, 1);
+
+ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+ const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+ const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+ const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
+
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
+ const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs);
+
+ const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+ const float32x4_t v_xy = vec_float(v_xy_);
+
+ const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+ acc = vec_madd(v_xy, v_d, acc);
+ }
+
+ sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
  #endif
  for (; ib < nb; ++ib) {
  int sumi0 = 0;
@@ -2624,7 +2931,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
  }
 
- void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;
 
@@ -2639,8 +2946,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);
 
- const block_q5_0 * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_q5_0 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
 
  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -2653,10 +2960,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  uint64_t tmp1[4];
 
  for (; ib + 1 < nb; ib += 2) {
- const block_q5_0 * restrict x0 = &x[ib];
- const block_q5_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
 
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
 
@@ -2709,26 +3016,26 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  }
 
  sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
- #elif defined(__wasm_simd128__)
+ #elif defined __wasm_simd128__
  v128_t sumv = wasm_f32x4_splat(0.0f);
 
- uint32_t qh;
+ uint32_t qh_;
  uint64_t tmp[4];
 
  // TODO: check if unrolling this is better
  for (; ib < nb; ++ib) {
- const block_q5_0 * restrict x0 = &x[ib];
- const block_q8_0 * restrict y0 = &y[ib];
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
 
  const v128_t m4b = wasm_i8x16_splat(0x0F);
 
  // extract the 5th bit
- memcpy(&qh, x0->qh, sizeof(qh));
+ memcpy(&qh_, x0->qh, sizeof(qh_));
 
- tmp[0] = table_b2b_1[(qh >> 0) & 0xFF];
- tmp[1] = table_b2b_1[(qh >> 8) & 0xFF];
- tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
- tmp[3] = table_b2b_1[(qh >> 24) ];
+ tmp[0] = table_b2b_1[(qh_ >> 0) & 0xFF];
+ tmp[1] = table_b2b_1[(qh_ >> 8) & 0xFF];
+ tmp[2] = table_b2b_1[(qh_ >> 16) & 0xFF];
+ tmp[3] = table_b2b_1[(qh_ >> 24) ];
 
  const v128_t qhl = wasm_v128_load(tmp + 0);
  const v128_t qhh = wasm_v128_load(tmp + 2);
@@ -2979,7 +3286,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
  }
 
- void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_1;
  const int nb = n / qk;
 
@@ -2994,8 +3301,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);
 
- const block_q5_1 * restrict x = vx;
- const block_q8_1 * restrict y = vy;
+ const block_q5_1 * GGML_RESTRICT x = vx;
+ const block_q8_1 * GGML_RESTRICT y = vy;
 
  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -3011,10 +3318,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  uint64_t tmp1[4];
 
  for (; ib + 1 < nb; ib += 2) {
- const block_q5_1 * restrict x0 = &x[ib];
- const block_q5_1 * restrict x1 = &x[ib + 1];
- const block_q8_1 * restrict y0 = &y[ib];
- const block_q8_1 * restrict y1 = &y[ib + 1];
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+ const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
 
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
 
@@ -3070,30 +3377,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  }
 
  sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
- #elif defined(__wasm_simd128__)
+ #elif defined __wasm_simd128__
  v128_t sumv = wasm_f32x4_splat(0.0f);
 
  float summs = 0.0f;
 
- uint32_t qh;
+ uint32_t qh_;
  uint64_t tmp[4];
 
  // TODO: check if unrolling this is better
  for (; ib < nb; ++ib) {
- const block_q5_1 * restrict x0 = &x[ib];
- const block_q8_1 * restrict y0 = &y[ib];
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
 
  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
 
  const v128_t m4b = wasm_i8x16_splat(0x0F);
 
  // extract the 5th bit
- memcpy(&qh, x0->qh, sizeof(qh));
+ memcpy(&qh_, x0->qh, sizeof(qh_));
 
- tmp[0] = table_b2b_0[(qh >> 0) & 0xFF];
- tmp[1] = table_b2b_0[(qh >> 8) & 0xFF];
- tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
- tmp[3] = table_b2b_0[(qh >> 24) ];
+ tmp[0] = table_b2b_0[(qh_ >> 0) & 0xFF];
+ tmp[1] = table_b2b_0[(qh_ >> 8) & 0xFF];
+ tmp[2] = table_b2b_0[(qh_ >> 16) & 0xFF];
+ tmp[3] = table_b2b_0[(qh_ >> 24) ];
 
  const v128_t qhl = wasm_v128_load(tmp + 0);
  const v128_t qhh = wasm_v128_load(tmp + 2);
@@ -3353,7 +3660,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
  }
 
- void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;
 
@@ -3368,24 +3675,24 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3368
3675
  UNUSED(by);
3369
3676
  UNUSED(bs);
3370
3677
 
3371
- const block_q8_0 * restrict x = vx;
3372
- const block_q8_0 * restrict y = vy;
3678
+ const block_q8_0 * GGML_RESTRICT x = vx;
3679
+ const block_q8_0 * GGML_RESTRICT y = vy;
3373
3680
 
3374
3681
  #if defined(__ARM_FEATURE_MATMUL_INT8)
3375
3682
  if (nrc == 2) {
3376
- const block_q8_0 * restrict vx0 = vx;
3377
- const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
3378
- const block_q8_0 * restrict vy0 = vy;
3379
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
3683
+ const block_q8_0 * GGML_RESTRICT vx0 = vx;
3684
+ const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
3685
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
3686
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
3380
3687
 
3381
3688
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3382
3689
 
3383
3690
  for (int i = 0; i < nb; i++) {
3384
- const block_q8_0 * restrict b_x0 = &vx0[i];
3385
- const block_q8_0 * restrict b_y0 = &vy0[i];
3691
+ const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i];
3692
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
3386
3693
 
3387
- const block_q8_0 * restrict b_x1 = &vx1[i];
3388
- const block_q8_0 * restrict b_y1 = &vy1[i];
3694
+ const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i];
3695
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
3389
3696
 
3390
3697
  const int8x16_t x0_l = vld1q_s8(b_x0->qs);
3391
3698
  const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
@@ -3450,10 +3757,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3450
3757
  const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
3451
3758
 
3452
3759
  for (; ib + 1 < nb; ib += 2) {
3453
- const block_q8_0 * restrict x0 = &x[ib + 0];
3454
- const block_q8_0 * restrict x1 = &x[ib + 1];
3455
- const block_q8_0 * restrict y0 = &y[ib + 0];
3456
- const block_q8_0 * restrict y1 = &y[ib + 1];
3760
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3761
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3762
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3763
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3457
3764
 
3458
3765
  // load x
3459
3766
  const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
@@ -3481,10 +3788,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3481
3788
  {
3482
3789
  //printf("sve256");
3483
3790
  for (; ib + 1 < nb; ib += 2) {
3484
- const block_q8_0 * restrict x0 = &x[ib + 0];
3485
- const block_q8_0 * restrict x1 = &x[ib + 1];
3486
- const block_q8_0 * restrict y0 = &y[ib + 0];
3487
- const block_q8_0 * restrict y1 = &y[ib + 1];
3791
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3792
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3793
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3794
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3488
3795
 
3489
3796
  // load x
3490
3797
  const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
@@ -3517,10 +3824,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3517
3824
  svfloat32_t sumv00 = svdup_n_f32(0.0f);
3518
3825
 
3519
3826
  for (; ib + 1 < nb; ib += 2) {
3520
- const block_q8_0 * restrict x0 = &x[ib + 0];
3521
- const block_q8_0 * restrict x1 = &x[ib + 1];
3522
- const block_q8_0 * restrict y0 = &y[ib + 0];
3523
- const block_q8_0 * restrict y1 = &y[ib + 1];
3827
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3828
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3829
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3830
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3524
3831
 
3525
3832
  // load 32 int8_t values into the first half of the vector and put another 32 int8_t into the lower bits of a second vector,
3526
3833
  // then add them to form one 64-element vector
@@ -3560,10 +3867,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3560
3867
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
3561
3868
 
3562
3869
  for (; ib + 1 < nb; ib += 2) {
3563
- const block_q8_0 * restrict x0 = &x[ib + 0];
3564
- const block_q8_0 * restrict x1 = &x[ib + 1];
3565
- const block_q8_0 * restrict y0 = &y[ib + 0];
3566
- const block_q8_0 * restrict y1 = &y[ib + 1];
3870
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3871
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3872
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3873
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3567
3874
 
3568
3875
  const int8x16_t x0_0 = vld1q_s8(x0->qs);
3569
3876
  const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
@@ -3586,6 +3893,45 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3586
3893
  }
3587
3894
 
3588
3895
  sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
3896
+ #elif defined __wasm_simd128__
3897
+ v128_t sumv = wasm_f32x4_splat(0.0f);
3898
+
3899
+ for (; ib < nb; ++ib) {
3900
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib];
3901
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
3902
+
3903
+ const v128_t x0_0 = wasm_v128_load(x0->qs);
3904
+ const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
3905
+ const v128_t y0_0 = wasm_v128_load(y0->qs);
3906
+ const v128_t y0_1 = wasm_v128_load(y0->qs + 16);
3907
+
3908
+ // Extend 8-bit to 16-bit
3909
+ const v128_t x0_0l = wasm_i16x8_extend_low_i8x16(x0_0);
3910
+ const v128_t x0_0h = wasm_i16x8_extend_high_i8x16(x0_0);
3911
+ const v128_t x0_1l = wasm_i16x8_extend_low_i8x16(x0_1);
3912
+ const v128_t x0_1h = wasm_i16x8_extend_high_i8x16(x0_1);
3913
+
3914
+ const v128_t y0_0l = wasm_i16x8_extend_low_i8x16(y0_0);
3915
+ const v128_t y0_0h = wasm_i16x8_extend_high_i8x16(y0_0);
3916
+ const v128_t y0_1l = wasm_i16x8_extend_low_i8x16(y0_1);
3917
+ const v128_t y0_1h = wasm_i16x8_extend_high_i8x16(y0_1);
3918
+
3919
+ // Compute dot products
3920
+ const v128_t dx0_0 = wasm_i32x4_dot_i16x8(x0_0l, y0_0l);
3921
+ const v128_t dx0_1 = wasm_i32x4_dot_i16x8(x0_0h, y0_0h);
3922
+ const v128_t dx1_0 = wasm_i32x4_dot_i16x8(x0_1l, y0_1l);
3923
+ const v128_t dx1_1 = wasm_i32x4_dot_i16x8(x0_1h, y0_1h);
3924
+
3925
+ // Sum all dot products
3926
+ const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1));
3927
+
3928
+ // Convert to float and accumulate
3929
+ const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
3930
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale)));
3931
+ }
3932
+
3933
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
3934
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
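The accumulation above leans on wasm_i32x4_dot_i16x8, which, like _mm_madd_epi16 on x86, multiplies adjacent i16 pairs and sums each pair into one i32 lane. A scalar sketch of that primitive (hypothetical helper, as a reading aid for the loop above):

    // Per 32-bit lane i: out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1],
    // with the i16 inputs widened to i32 before the multiply.
    static void dot_i16x8_ref(const int16_t a[8], const int16_t b[8], int32_t out[4]) {
        for (int i = 0; i < 4; ++i) {
            out[i] = (int32_t)a[2*i] * b[2*i] + (int32_t)a[2*i + 1] * b[2*i + 1];
        }
    }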
3589
3935
  #elif defined(__AVX2__)
3590
3936
  // Initialize accumulator with zeros
3591
3937
  __m256 acc = _mm256_setzero_ps();
@@ -3699,6 +4045,27 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3699
4045
  }
3700
4046
 
3701
4047
  sumf = hsum_float_8(acc);
4048
+ #elif defined(__VXE__) || defined(__VXE2__)
4049
+ __vector float acc = vec_splats(0.0f);
4050
+
4051
+ #pragma GCC unroll 8
4052
+ for (; ib < nb; ++ib) {
4053
+ __builtin_prefetch(x[ib].qs, 0, 1);
4054
+ __builtin_prefetch(y[ib].qs, 0, 1);
4055
+
4056
+ const int8x16_t v_xl = vec_xl(0 , x[ib].qs);
4057
+ const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs);
4058
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
4059
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
4060
+
4061
+ const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
4062
+ const float32x4_t v_xy = vec_float(v_xy_);
4063
+ const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
4064
+
4065
+ acc = vec_madd(v_xy, v_d, acc);
4066
+ }
4067
+
4068
+ sumf = acc[0] + acc[1] + acc[2] + acc[3];
3702
4069
  #endif
3703
4070
  for (; ib < nb; ++ib) {
3704
4071
  int sumi = 0;
@@ -3713,15 +4080,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3713
4080
  *s = sumf;
3714
4081
  }
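Throughout this file the bare C99 restrict qualifiers are replaced by a GGML_RESTRICT macro. The macro's definition is not shown in this diff; presumably it exists so the same sources build as C, as C++ and under MSVC. A typical definition would look like the sketch below (an assumption, not the actual ggml header):

    #if defined(_MSC_VER) || defined(__cplusplus)
    #  define GGML_RESTRICT __restrict    // extension spelling accepted by MSVC and C++ compilers
    #else
    #  define GGML_RESTRICT restrict      // plain C99
    #endif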
3715
4082
 
3716
- void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
4083
+ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3717
4084
  assert(nrc == 1);
3718
4085
  UNUSED(nrc);
3719
4086
  UNUSED(bx);
3720
4087
  UNUSED(by);
3721
4088
  UNUSED(bs);
3722
4089
 
3723
- const block_tq1_0 * restrict x = vx;
3724
- const block_q8_K * restrict y = vy;
4090
+ const block_tq1_0 * GGML_RESTRICT x = vx;
4091
+ const block_q8_K * GGML_RESTRICT y = vy;
3725
4092
 
3726
4093
  const int nb = n / QK_K;
3727
4094
 
@@ -4036,15 +4403,15 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
4036
4403
  #endif
4037
4404
  }
4038
4405
 
4039
- void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
4406
+ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4040
4407
  assert(nrc == 1);
4041
4408
  UNUSED(nrc);
4042
4409
  UNUSED(bx);
4043
4410
  UNUSED(by);
4044
4411
  UNUSED(bs);
4045
4412
 
4046
- const block_tq2_0 * restrict x = vx;
4047
- const block_q8_K * restrict y = vy;
4413
+ const block_tq2_0 * GGML_RESTRICT x = vx;
4414
+ const block_q8_K * GGML_RESTRICT y = vy;
4048
4415
 
4049
4416
  const int nb = n / QK_K;
4050
4417
 
@@ -4208,19 +4575,264 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
4208
4575
  #endif
4209
4576
  }
4210
4577
 
4211
- void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
4578
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4212
4579
  assert(nrc == 1);
4213
4580
  UNUSED(nrc);
4214
4581
  UNUSED(bx);
4215
4582
  UNUSED(by);
4216
4583
  UNUSED(bs);
4217
4584
 
4218
- const block_q2_K * restrict x = vx;
4219
- const block_q8_K * restrict y = vy;
4585
+ const block_q2_K * GGML_RESTRICT x = vx;
4586
+ const block_q8_K * GGML_RESTRICT y = vy;
4220
4587
 
4221
4588
  const int nb = n / QK_K;
4222
4589
 
4223
- #ifdef __ARM_NEON
4590
+ #ifdef __ARM_FEATURE_SVE
4591
+ const int vector_length = svcntb()*8;
4592
+ const svuint8_t m3s = svdup_n_u8(0x3);
4593
+ const svuint32_t m4s = svdup_n_u32(0xF);
4594
+ const svint32_t vzero_sv = svdup_n_s32(0);
4595
+ svfloat32_t acc_sum = svdup_n_f32(0);
4596
+ svbool_t pred_s32 = svptrue_pat_b32(SV_VL4);
4597
+
4598
+ switch (vector_length) {
4599
+ case 128:
4600
+ for (int i = 0; i < nb; ++i) {
4601
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4602
+ svfloat32_t d_broad = svdup_n_f32((float32_t)d);
4603
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4604
+ svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
4605
+
4606
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4607
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
4608
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4609
+
4610
+ svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc);
4611
+ const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
4612
+
4613
+ mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+4);
4614
+ const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
4615
+
4616
+ svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums);
4617
+ svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+4);
4618
+
4619
+ const svint32_t s0 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_2, q8sums_sv_2));
4620
+
4621
+ mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+8);
4622
+ const svint32_t mins_sv_3 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
4623
+
4624
+ mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+12);
4625
+ const svint32_t mins_sv_4 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
4626
+
4627
+ q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums+8);
4628
+ q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+12);
4629
+
4630
+ svint32_t s1 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_3, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_4, q8sums_sv_2));
4631
+
4632
+ svfloat32_t temp = svcvt_f32_s32_x(svptrue_b32(), svadd_s32_x(svptrue_b32(), s0, s1));
4633
+
4634
+ acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, temp, dmin_broad);
4635
+
4636
+ svint32_t sumi1 = svdup_n_s32(0);
4637
+
4638
+ {
4639
+ const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2);
4640
+ svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s));
4641
+ svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4642
+ const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc), m4s));
4643
+
4644
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 0));
4645
+
4646
+ const svuint8_t q2bits_3 = svld1_u8(svptrue_b8(), q2+16);
4647
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_3, m3s));
4648
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4649
+
4650
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 1));
4651
+
4652
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 2), m3s));
4653
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4654
+
4655
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 2));
4656
+
4657
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 2), m3s));
4658
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4659
+
4660
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 3));
4661
+
4662
+
4663
+ const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+4), m4s));
4664
+
4665
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 4), m3s));
4666
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4667
+
4668
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 0));
4669
+
4670
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 4), m3s));
4671
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4672
+
4673
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 1));
4674
+
4675
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 6), m3s));
4676
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4677
+
4678
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 2));
4679
+
4680
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 6), m3s));
4681
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4682
+
4683
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 3));
4684
+
4685
+ //-------------------------------
4686
+
4687
+ q2 += 32;
4688
+ const svint32_t scales_sv_2 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+8), m4s));
4689
+ const svuint8_t q2bits_2 = svld1_u8(svptrue_b8(), q2);
4690
+
4691
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_2, m3s));
4692
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4693
+
4694
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 0));
4695
+
4696
+ const svuint8_t q2bits_4 = svld1_u8(svptrue_b8(), q2+16);
4697
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_4, m3s));
4698
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4699
+
4700
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 1));
4701
+
4702
+
4703
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 2), m3s));
4704
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4705
+
4706
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 2));
4707
+
4708
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 2), m3s));
4709
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4710
+
4711
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 3));
4712
+
4713
+
4714
+ const svint32_t scales_sv_3 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+12), m4s));
4715
+
4716
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 4), m3s));
4717
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4718
+
4719
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 0));
4720
+
4721
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 4), m3s));
4722
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4723
+
4724
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 1));
4725
+
4726
+
4727
+
4728
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 6), m3s));
4729
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4730
+
4731
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 2));
4732
+
4733
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 6), m3s));
4734
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4735
+
4736
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 3));
4737
+ }
4738
+ acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, svcvt_f32_s32_x(svptrue_b32(), sumi1), d_broad);
4739
+ }
4740
+ *s = svaddv_f32(svptrue_b32(), acc_sum);
4741
+ break;
4742
+
4743
+ case 256:
4744
+ case 512:
4745
+ for (int i = 0; i < nb; ++i) {
4746
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4747
+ svfloat32_t d_broad = svdup_n_f32((float32_t)d);
4748
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4749
+ svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
4750
+
4751
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4752
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
4753
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4754
+
4755
+ const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8;
4756
+ const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s));
4757
+ const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, 4));
4758
+ svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums);
4759
+
4760
+ const svuint32_t mins_and_scales_sve_1 = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc);
4761
+ const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, m4s));
4762
+ const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, 4));
4763
+
4764
+ svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums+8);
4765
+
4766
+ svfloat32_t temp = svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_2, q8sums_sv_2)));
4767
+
4768
+ acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad);
4769
+
4770
+ svint32_t sumi1 = svdup_n_s32(0);
4771
+
4772
+ {
4773
+ const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
4774
+ svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s));
4775
+ svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4776
+
4777
+ svint32_t scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 0), svdup_lane_s32(scales_sv, 1));
4778
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
4779
+
4780
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 2), m3s));
4781
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4782
+
4783
+ svint32_t scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 2), svdup_lane_s32(scales_sv, 3));
4784
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(svdup_n_s32(0), q2bytes_sv, q8bytes_sv), scale_2);
4785
+
4786
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 4), m3s));
4787
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4788
+
4789
+ scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 4), svdup_lane_s32(scales_sv, 5));
4790
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
4791
+
4792
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 6), m3s));
4793
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4794
+
4795
+ scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 6), svdup_lane_s32(scales_sv, 7));
4796
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
4797
+
4798
+ q2 += 32;
4799
+
4800
+ const svuint8_t q2bits_2 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
4801
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_2, m3s));
4802
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4803
+
4804
+ scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 0), svdup_lane_s32(scales_sv_1, 1));
4805
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
4806
+
4807
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 2), m3s));
4808
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4809
+
4810
+ scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 2), svdup_lane_s32(scales_sv_1, 3));
4811
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
4812
+
4813
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 4), m3s));
4814
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4815
+
4816
+ scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 4), svdup_lane_s32(scales_sv_1, 5));
4817
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
4818
+
4819
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 6), m3s));
4820
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4821
+
4822
+ scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7));
4823
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
4824
+ }
4825
+ acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad);
4826
+ }
4827
+ *s = svaddv_f32(svptrue_pat_b32(SV_VL8), acc_sum);
4828
+ break;
4829
+
4830
+ default:
4831
+ assert(false && "Unsupported vector length");
4832
+ break;
4833
+ }
4834
+
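Judging by the 0xF masks and the >> 4 shifts, both the SVE path above and the NEON path below split each byte of x[i].scales into two nibbles: the low nibble is a 4-bit sub-block scale, the high nibble a 4-bit sub-block min that is folded in through y[i].bsums and the dmin factor. A minimal sketch of that split, under that assumption (hypothetical helper name):

    static inline void q2_K_scale_min(uint8_t sc, int * scale, int * min) {
        *scale = sc & 0xF;   // low nibble: sub-block scale
        *min   = sc >> 4;    // high nibble: sub-block min
    }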
4835
+ #elif __ARM_NEON
4224
4836
  const uint8x16_t m3 = vdupq_n_u8(0x3);
4225
4837
  const uint8x16_t m4 = vdupq_n_u8(0xF);
4226
4838
 
@@ -4235,9 +4847,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4235
4847
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4236
4848
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4237
4849
 
4238
- const uint8_t * restrict q2 = x[i].qs;
4239
- const int8_t * restrict q8 = y[i].qs;
4240
- const uint8_t * restrict sc = x[i].scales;
4850
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4851
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4852
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4241
4853
 
4242
4854
  const uint8x16_t mins_and_scales = vld1q_u8(sc);
4243
4855
  const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
@@ -4300,8 +4912,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4300
4912
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4301
4913
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4302
4914
 
4303
- const uint8_t * restrict q2 = x[i].qs;
4304
- const int8_t * restrict q8 = y[i].qs;
4915
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4916
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4305
4917
 
4306
4918
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
4307
4919
  const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
@@ -4367,8 +4979,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4367
4979
  const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4368
4980
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4369
4981
 
4370
- const uint8_t * restrict q2 = x[i].qs;
4371
- const int8_t * restrict q8 = y[i].qs;
4982
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4983
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4372
4984
 
4373
4985
  // load mins and scales from block_q2_K.scales[QK_K/16]
4374
4986
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
@@ -4460,6 +5072,106 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4460
5072
 
4461
5073
  *s = hsum_float_8(acc);
4462
5074
 
5075
+ #elif defined __wasm_simd128__
5076
+ float sumf = 0;
5077
+
5078
+ for (int i = 0; i < nb; ++i) {
5079
+ const uint8_t * q2 = x[i].qs;
5080
+ const int8_t * q8 = y[i].qs;
5081
+ const uint8_t * sc = x[i].scales;
5082
+
5083
+ // Vectorized summs calculation
5084
+ v128_t summs_vec = wasm_i32x4_splat(0);
5085
+ {
5086
+ v128_t sc_vec = wasm_v128_load(sc);
5087
+ v128_t sc_upper = wasm_u8x16_shr(sc_vec, 4);
5088
+
5089
+ v128_t sc_low = wasm_u16x8_extend_low_u8x16(sc_upper);
5090
+ v128_t sc_high = wasm_u16x8_extend_high_u8x16(sc_upper);
5091
+
5092
+ v128_t bsums1 = wasm_v128_load(&y[i].bsums[0]);
5093
+ v128_t bsums2 = wasm_v128_load(&y[i].bsums[8]);
5094
+
5095
+ summs_vec = wasm_i32x4_add(
5096
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(sc_low, bsums1),
5097
+ wasm_i32x4_dot_i16x8(sc_high, bsums2)),
5098
+ summs_vec
5099
+ );
5100
+
5101
+ summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 2, 3, 0, 1));
5102
+ summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 1, 0, 3, 2));
5103
+ }
5104
+ int32_t summs = wasm_i32x4_extract_lane(summs_vec, 0);
5105
+
5106
+ // Vectorized isum calculation
5107
+ int32_t isum = 0;
5108
+ const uint8_t * sc_ptr = sc;
5109
+ const int k_iters = QK_K/128;
5110
+
5111
+ for (int k = 0; k < k_iters; ++k) {
5112
+ v128_t isum_vec = wasm_i32x4_splat(0);
5113
+ int shift = 0;
5114
+
5115
+ for (int j = 0; j < 4; ++j) {
5116
+ const int d0 = (sc_ptr[0] & 0xF);
5117
+ const int d1 = (sc_ptr[1] & 0xF);
5118
+ sc_ptr += 2;
5119
+
5120
+ // Process first 16 elements
5121
+ v128_t q2_0 = wasm_v128_load(q2);
5122
+ v128_t q8_0 = wasm_v128_load(q8);
5123
+ v128_t q2_shift_0 = wasm_u8x16_shr(q2_0, shift);
5124
+ v128_t q2_bits_0 = wasm_v128_and(q2_shift_0, wasm_i8x16_splat(0x03));
5125
+
5126
+ // Process next 16 elements
5127
+ v128_t q2_1 = wasm_v128_load(q2 + 16);
5128
+ v128_t q8_1 = wasm_v128_load(q8 + 16);
5129
+ v128_t q2_shift_1 = wasm_u8x16_shr(q2_1, shift);
5130
+ v128_t q2_bits_1 = wasm_v128_and(q2_shift_1, wasm_i8x16_splat(0x03));
5131
+
5132
+ // Calculate dot products
5133
+ v128_t p0 = wasm_i32x4_dot_i16x8(
5134
+ wasm_i16x8_extend_low_i8x16(q8_0),
5135
+ wasm_i16x8_extend_low_i8x16(q2_bits_0)
5136
+ );
5137
+ v128_t p1 = wasm_i32x4_dot_i16x8(
5138
+ wasm_i16x8_extend_high_i8x16(q8_0),
5139
+ wasm_i16x8_extend_high_i8x16(q2_bits_0)
5140
+ );
5141
+ v128_t p2 = wasm_i32x4_dot_i16x8(
5142
+ wasm_i16x8_extend_low_i8x16(q8_1),
5143
+ wasm_i16x8_extend_low_i8x16(q2_bits_1)
5144
+ );
5145
+ v128_t p3 = wasm_i32x4_dot_i16x8(
5146
+ wasm_i16x8_extend_high_i8x16(q8_1),
5147
+ wasm_i16x8_extend_high_i8x16(q2_bits_1)
5148
+ );
5149
+
5150
+ // Accumulate scaled results
5151
+ v128_t scaled = wasm_i32x4_add(
5152
+ wasm_i32x4_mul(wasm_i32x4_add(p0, p1), wasm_i32x4_splat(d0)),
5153
+ wasm_i32x4_mul(wasm_i32x4_add(p2, p3), wasm_i32x4_splat(d1))
5154
+ );
5155
+
5156
+ isum_vec = wasm_i32x4_add(isum_vec, scaled);
5157
+ q8 += 32;
5158
+ shift += 2;
5159
+ }
5160
+ q2 += 32;
5161
+
5162
+ // Horizontal sum of isum_vec
5163
+ isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 2, 3, 0, 1));
5164
+ isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 1, 0, 3, 2));
5165
+ isum += wasm_i32x4_extract_lane(isum_vec, 0);
5166
+ }
5167
+
5168
+ const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
5169
+ const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
5170
+ sumf += dall * isum - dmin * summs;
5171
+ }
5172
+
5173
+ *s = sumf;
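The WASM branch above mirrors the scalar structure of this kernel closely. The sketch below restates that structure in plain C — same 2-bit shifts, same nibble scales, same dall/dmin combination — as a reading aid; the function name is illustrative and it is not the ggml reference path verbatim:

    static float vec_dot_q2_K_ref_block(const block_q2_K * xb, const block_q8_K * yb) {
        const uint8_t * q2 = xb->qs;
        const int8_t  * q8 = yb->qs;
        const uint8_t * sc = xb->scales;

        int summs = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            summs += yb->bsums[j] * (sc[j] >> 4);      // high nibble: sub-block min
        }

        int isum = 0;
        int is   = 0;
        for (int k = 0; k < QK_K/128; ++k) {
            for (int shift = 0; shift <= 6; shift += 2) {
                for (int half = 0; half < 2; ++half) { // two groups of 16 quants per shift
                    const int d = sc[is++] & 0xF;      // low nibble: sub-block scale
                    int part = 0;
                    for (int l = 0; l < 16; ++l) {
                        part += q8[l] * ((q2[16*half + l] >> shift) & 3);
                    }
                    isum += d * part;
                    q8 += 16;
                }
            }
            q2 += 32;
        }

        const float dall = GGML_FP16_TO_FP32(xb->d)    * yb->d;
        const float dmin = GGML_FP16_TO_FP32(xb->dmin) * yb->d;
        return dall * isum - dmin * summs;
    }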
5174
+
4463
5175
  #elif defined __riscv_v_intrinsic
4464
5176
 
4465
5177
  float sumf = 0;
@@ -4594,8 +5306,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4594
5306
  vector signed int vsumi6 = v0;
4595
5307
  vector signed int vsumi7 = v0;
4596
5308
 
4597
- const uint8_t * restrict q2 = x[i].qs;
4598
- const int8_t * restrict q8 = y[i].qs;
5309
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
5310
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4599
5311
 
4600
5312
  for (int j = 0; j < QK_K/128; ++j) {
4601
5313
  __builtin_prefetch(q2, 0, 1);
@@ -4679,9 +5391,6 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4679
5391
 
4680
5392
  #elif defined __loongarch_asx
4681
5393
 
4682
- const __m256i m3 = __lasx_xvreplgr2vr_b(3);
4683
- const __m128i m4 = __lsx_vreplgr2vr_b(0xF);
4684
-
4685
5394
  __m256 acc = (__m256)__lasx_xvldi(0);
4686
5395
 
4687
5396
  for (int i = 0; i < nb; ++i) {
@@ -4689,21 +5398,18 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4689
5398
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4690
5399
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4691
5400
 
4692
- const uint8_t * restrict q2 = x[i].qs;
4693
- const int8_t * restrict q8 = y[i].qs;
5401
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
5402
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4694
5403
 
4695
- const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
4696
- const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
4697
- const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);
4698
- const __m256i mins = lasx_ext8_16(mins8);
5404
+ const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
5405
+ const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf);
5406
+ const __m256i mins = lasx_ext8_16(__lsx_vsrli_b(mins_and_scales128, 4));
4699
5407
  const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0));
4700
5408
 
4701
5409
  acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc);
4702
5410
 
4703
- const __m256i all_scales = lasx_ext8_16(scales8);
4704
- const __m128i l_scales = lasx_extracti128(all_scales, 0);
4705
- const __m128i h_scales = lasx_extracti128(all_scales, 1);
4706
- const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
5411
+ const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
5412
+ const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask));
4707
5413
 
4708
5414
  __m256i sumi = __lasx_xvldi(0);
4709
5415
 
@@ -4716,20 +5422,20 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4716
5422
  const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
4717
5423
  const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
4718
5424
 
4719
- const __m256i q2_0 = __lasx_xvand_v(q2bits, m3);
4720
- const __m256i q2_1 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 2), m3);
4721
- const __m256i q2_2 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 4), m3);
4722
- const __m256i q2_3 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 6), m3);
5425
+ const __m256i q2_0 = __lasx_xvandi_b(q2bits, 3);
5426
+ const __m256i q2_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 2), 3);
5427
+ const __m256i q2_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 4), 3);
5428
+ const __m256i q2_3 = __lasx_xvsrli_b(q2bits, 6);
4723
5429
 
4724
- __m256i p0 = lasx_maddubs_h(q2_0, q8_0);
4725
- __m256i p1 = lasx_maddubs_h(q2_1, q8_1);
4726
- __m256i p2 = lasx_maddubs_h(q2_2, q8_2);
4727
- __m256i p3 = lasx_maddubs_h(q2_3, q8_3);
5430
+ __m256i p0 = lasx_madd_h_b(q2_0, q8_0);
5431
+ __m256i p1 = lasx_madd_h_b(q2_1, q8_1);
5432
+ __m256i p2 = lasx_madd_h_b(q2_2, q8_2);
5433
+ __m256i p3 = lasx_madd_h_b(q2_3, q8_3);
4728
5434
 
4729
- p0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(0)), p0);
4730
- p1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(1)), p1);
4731
- p2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(2)), p2);
4732
- p3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(3)), p3);
5435
+ p0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p0);
5436
+ p1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p1);
5437
+ p2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p2);
5438
+ p3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p3);
4733
5439
 
4734
5440
  p0 = __lasx_xvadd_w(p0, p1);
4735
5441
  p2 = __lasx_xvadd_w(p2, p3);
@@ -4786,7 +5492,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4786
5492
  #endif
4787
5493
  }
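The LoongArch rewrite above drops the register-constant masks in favour of immediate-form instructions and replaces the per-iteration scale shuffle with a one-time reorder of the 16 sub-block scales: even-indexed scales go to the low 128-bit half, odd-indexed ones to the high half, so the inner loop can broadcast one scale per half with a single element replicate. A plain-C sketch of that reorder — my reading of the shuffle mask, so treat it as an assumption:

    static void preshuffle_scales(const int16_t in[16], int16_t out[16]) {
        for (int j = 0; j < 8; ++j) {
            out[j]     = in[2*j];       // even scales -> low 128-bit half
            out[j + 8] = in[2*j + 1];   // odd scales  -> high 128-bit half
        }
    }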
4788
5494
 
4789
- void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
5495
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4790
5496
  assert(n % QK_K == 0);
4791
5497
  assert(nrc == 1);
4792
5498
  UNUSED(nrc);
@@ -4797,12 +5503,187 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4797
5503
  const uint32_t kmask1 = 0x03030303;
4798
5504
  const uint32_t kmask2 = 0x0f0f0f0f;
4799
5505
 
4800
- const block_q3_K * restrict x = vx;
4801
- const block_q8_K * restrict y = vy;
5506
+ const block_q3_K * GGML_RESTRICT x = vx;
5507
+ const block_q8_K * GGML_RESTRICT y = vy;
4802
5508
 
4803
5509
  const int nb = n / QK_K;
4804
5510
 
4805
- #ifdef __ARM_NEON
5511
+ #if defined(__ARM_FEATURE_SVE)
5512
+
5513
+ uint32_t aux[3];
5514
+ uint32_t utmp[4];
5515
+
5516
+ const int8_t m32 = 32;
5517
+ const int vector_length = svcntb()*8;
5518
+ const svuint8_t m3b_sv = svdup_n_u8(0x3);
5519
+ const svint32_t vzero_sv = svdup_n_s32(0);
5520
+
5521
+ const svuint8_t m0_sv = svdup_n_u8(1);
5522
+ const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1);
5523
+ const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2);
5524
+ const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3);
5525
+
5526
+ float sum = 0;
5527
+
5528
+ for (int i = 0; i < nb; ++i) {
5529
+
5530
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5531
+
5532
+ const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
5533
+ const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
5534
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
5535
+
5536
+ // Set up scales
5537
+ memcpy(aux, x[i].scales, 12);
5538
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
5539
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
5540
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
5541
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
5542
+
5543
+ int8_t * scale = (int8_t *)utmp;
5544
+
5545
+ for (int j = 0; j < 16; ++j) scale[j] -= m32;
5546
+
5547
+ switch (vector_length) {
5548
+ case 128:
5549
+ {
5550
+ svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv);
5551
+ svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16);
5552
+ svuint8_t q3h_sv;
5553
+
5554
+ svint32_t sumi1_1 = svdup_n_s32(0);
5555
+ svint8_t q3bytes_sv;
5556
+
5557
+ for (int j = 0; j < QK_K/128; ++j) {
5558
+
5559
+ const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5560
+ const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5561
+ svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5562
+ svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5563
+
5564
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2);
5565
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5566
+
5567
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
5568
+
5569
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2);
5570
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5571
+
5572
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
5573
+
5574
+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5575
+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5576
+
5577
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1);
5578
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5579
+
5580
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
5581
+
5582
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1);
5583
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5584
+
5585
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
5586
+
5587
+
5588
+ scale += 4;
5589
+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5590
+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5591
+
5592
+ q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1);
5593
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5594
+
5595
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
5596
+
5597
+ q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2);
5598
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5599
+
5600
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
5601
+
5602
+
5603
+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5604
+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5605
+
5606
+ q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1);
5607
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5608
+
5609
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
5610
+
5611
+ q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1);
5612
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5613
+
5614
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
5615
+
5616
+ if (j == 0) {
5617
+ qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4);
5618
+ qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4);
5619
+ }
5620
+
5621
+ scale += 4;
5622
+ }
5623
+
5624
+ sum += d * (svaddv_s32(svptrue_b32(), sumi1_1));
5625
+ } break;
5626
+ case 256:
5627
+ case 512:
5628
+ {
5629
+ svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv);
5630
+ svuint8_t q3h_sv;
5631
+
5632
+ svint32_t sumi1_1 = svdup_n_s32(0);
5633
+ svint8_t q3bytes_sv;
5634
+
5635
+ for (int j = 0; j < QK_K/128; ++j) {
5636
+
5637
+ const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32;
5638
+ svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5639
+ svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5640
+
5641
+ q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2);
5642
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5643
+
5644
+
5645
+ svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5646
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
5647
+
5648
+ q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1);
5649
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5650
+
5651
+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5652
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
5653
+
5654
+ scale += 4;
5655
+ q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5656
+ q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5657
+
5658
+ q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv);
5659
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5660
+
5661
+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5662
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
5663
+
5664
+ q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1);
5665
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5666
+
5667
+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5668
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
5669
+
5670
+ if (j == 0) {
5671
+ qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4);
5672
+ }
5673
+
5674
+ scale += 4;
5675
+ }
5676
+
5677
+ sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1));
5678
+ } break;
5679
+ default:
5680
+ assert(false && "Unsupported vector length");
5681
+ break;
5682
+ }
5683
+ }
5684
+ *s = sum;
5685
+
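In q3_K the low two bits of each quant live in qs and the third bit in hmask; the svbic/shift sequence above reconstructs the signed value by subtracting 4 exactly when the high bit is clear. A scalar sketch of that reconstruction (hypothetical helper):

    // value = (low 2 bits | high bit << 2) - 4, i.e. a signed range of -4..3
    static inline int8_t q3_K_unpack(uint8_t low2, int high_bit_set) {
        return (int8_t)(low2 | (high_bit_set ? 4 : 0)) - 4;
    }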
5686
+ #elif __ARM_NEON
4806
5687
 
4807
5688
  uint32_t aux[3];
4808
5689
  uint32_t utmp[4];
@@ -4824,9 +5705,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4824
5705
 
4825
5706
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4826
5707
 
4827
- const uint8_t * restrict q3 = x[i].qs;
4828
- const uint8_t * restrict qh = x[i].hmask;
4829
- const int8_t * restrict q8 = y[i].qs;
5708
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5709
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
5710
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4830
5711
 
4831
5712
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
4832
5713
 
@@ -4910,8 +5791,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4910
5791
 
4911
5792
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4912
5793
 
4913
- const uint8_t * restrict q3 = x[i].qs;
4914
- const int8_t * restrict q8 = y[i].qs;
5794
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5795
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4915
5796
 
4916
5797
  // Set up scales
4917
5798
  memcpy(aux, x[i].scales, 12);
@@ -5015,8 +5896,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5015
5896
 
5016
5897
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5017
5898
 
5018
- const uint8_t * restrict q3 = x[i].qs;
5019
- const int8_t * restrict q8 = y[i].qs;
5899
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5900
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5020
5901
 
5021
5902
  // Set up scales
5022
5903
  aux = (const uint32_t *)x[i].scales;
@@ -5142,6 +6023,94 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5142
6023
 
5143
6024
  *s = hsum_float_8(acc);
5144
6025
 
6026
+ #elif defined __wasm_simd128__
6027
+ int8_t aux8[QK_K];
6028
+ float sums[8] = {0};
6029
+ uint32_t auxs[4];
6030
+
6031
+ float sumf = 0;
6032
+ for (int i = 0; i < nb; ++i) {
6033
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6034
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
6035
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6036
+
6037
+ // Process blocks with SIMD
6038
+ int8_t * a = aux8;
6039
+ uint8_t m = 1;
6040
+ for (int j = 0; j < QK_K; j += 128) {
6041
+ for (int shift = 0; shift <= 6; shift += 2) {
6042
+ v128_t v_m = wasm_i8x16_splat(m);
6043
+ for (int l = 0; l < 32; l += 16) {
6044
+ v128_t v_q3 = wasm_v128_load(q3 + l);
6045
+ v128_t v_shift = wasm_i8x16_shr(v_q3, shift);
6046
+ v128_t v_low2 = wasm_v128_and(v_shift, wasm_i8x16_splat(0x03));
6047
+
6048
+ v128_t v_hm = wasm_v128_load(hm + l);
6049
+ v128_t v_mask = wasm_v128_and(v_hm, v_m);
6050
+ v_mask = wasm_i8x16_ne(v_mask, wasm_i8x16_splat(0));
6051
+
6052
+ v_low2 = wasm_i8x16_sub(v_low2, wasm_v128_and(wasm_i8x16_splat(4), wasm_v128_not(v_mask)));
6053
+ wasm_v128_store(a + l, v_low2);
6054
+ }
6055
+ a += 32;
6056
+ m <<= 1;
6057
+ }
6058
+ q3 += 32;
6059
+ }
6060
+
6061
+ // Extract scales
6062
+ memcpy(auxs, x[i].scales, 12);
6063
+ uint32_t tmp = auxs[2];
6064
+ auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
6065
+ auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
6066
+ auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
6067
+ auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
6068
+ const int8_t * scales = (const int8_t *)auxs;
6069
+
6070
+ // SIMD dot product with register accumulators
6071
+ v128_t v_acc0 = wasm_i32x4_splat(0);
6072
+ v128_t v_acc1 = wasm_i32x4_splat(0);
6073
+ a = aux8;
6074
+ for (int j = 0; j < QK_K/16; ++j) {
6075
+ const v128_t v_scale = wasm_i16x8_splat(scales[j] - 32);
6076
+
6077
+ // Process 16 elements per iteration
6078
+ for (int k = 0; k < 2; ++k) {
6079
+ const v128_t v_q8 = wasm_i16x8_load8x8(q8);
6080
+ const v128_t v_a = wasm_i16x8_load8x8(a);
6081
+
6082
+ v128_t v_prod = wasm_i16x8_mul(v_q8, v_a);
6083
+ v_prod = wasm_i16x8_mul(v_prod, v_scale);
6084
+
6085
+ v_acc0 = wasm_i32x4_add(v_acc0, wasm_i32x4_extend_low_i16x8(v_prod));
6086
+ v_acc1 = wasm_i32x4_add(v_acc1, wasm_i32x4_extend_high_i16x8(v_prod));
6087
+
6088
+ q8 += 8;
6089
+ a += 8;
6090
+ }
6091
+ }
6092
+
6093
+ // Accumulate results
6094
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
6095
+ const v128_t v_d = wasm_f32x4_splat(d);
6096
+ v128_t v_sum = wasm_f32x4_add(
6097
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d),
6098
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc1), v_d)
6099
+ );
6100
+
6101
+ // Accumulate into sums vector
6102
+ wasm_v128_store(sums, wasm_f32x4_add(wasm_v128_load(sums), v_sum));
6103
+ }
6104
+
6105
+ // Horizontal sum
6106
+ v128_t v_sum = wasm_f32x4_add(wasm_v128_load(sums), wasm_v128_load(sums + 4));
6107
+ sumf = wasm_f32x4_extract_lane(v_sum, 0) +
6108
+ wasm_f32x4_extract_lane(v_sum, 1) +
6109
+ wasm_f32x4_extract_lane(v_sum, 2) +
6110
+ wasm_f32x4_extract_lane(v_sum, 3);
6111
+
6112
+ *s = sumf;
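The 12 bytes of x[i].scales pack sixteen 6-bit scales; the kmask1/kmask2 shuffle above (and in the other branches) expands them and re-centres each one by 32. The same expansion as a standalone helper, assuming <string.h> for memcpy; the name is illustrative:

    static void q3_K_unpack_scales(const uint8_t packed[12], int8_t scales[16]) {
        const uint32_t kmask1 = 0x03030303;
        const uint32_t kmask2 = 0x0f0f0f0f;
        uint32_t aux[4];
        memcpy(aux, packed, 12);
        const uint32_t tmp = aux[2];
        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux[0] = ((aux[0] >> 0) & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux[1] = ((aux[1] >> 0) & kmask2) | (((tmp >> 2) & kmask1) << 4);
        memcpy(scales, aux, 16);
        for (int j = 0; j < 16; ++j) {
            scales[j] -= 32;            // re-centre the 6-bit scales
        }
    }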
6113
+
5145
6114
  #elif defined __riscv_v_intrinsic
5146
6115
 
5147
6116
  uint32_t aux[3];
@@ -5150,9 +6119,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5150
6119
  float sumf = 0;
5151
6120
  for (int i = 0; i < nb; ++i) {
5152
6121
 
5153
- const uint8_t * restrict q3 = x[i].qs;
5154
- const uint8_t * restrict qh = x[i].hmask;
5155
- const int8_t * restrict q8 = y[i].qs;
6122
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6123
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
6124
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5156
6125
 
5157
6126
  memcpy(aux, x[i].scales, 12);
5158
6127
  utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
@@ -5292,8 +6261,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5292
6261
  vector signed int vsumi6 = v0;
5293
6262
  vector signed int vsumi7 = v0;
5294
6263
 
5295
- const uint8_t * restrict q3 = x[i].qs;
5296
- const int8_t * restrict q8 = y[i].qs;
6264
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6265
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5297
6266
 
5298
6267
  for (int j = 0; j < QK_K/128; ++j) {
5299
6268
  __builtin_prefetch(q3, 0, 1);
@@ -5397,8 +6366,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5397
6366
 
5398
6367
  #elif defined __loongarch_asx
5399
6368
 
5400
- const __m256i m3 = __lasx_xvreplgr2vr_b(3);
5401
- const __m256i mone = __lasx_xvreplgr2vr_b(1);
5402
6369
  const __m128i m32 = __lsx_vreplgr2vr_b(32);
5403
6370
 
5404
6371
  __m256 acc = (__m256)__lasx_xvldi(0);
@@ -5408,8 +6375,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5408
6375
  for (int i = 0; i < nb; ++i) {
5409
6376
 
5410
6377
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5411
- const uint8_t * restrict q3 = x[i].qs;
5412
- const int8_t * restrict q8 = y[i].qs;
6378
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6379
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5413
6380
  // Set up scales
5414
6381
  memcpy(aux, x[i].scales, 12);
5415
6382
  __m128i scales128 = lsx_set_w(
@@ -5418,10 +6385,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5418
6385
  (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
5419
6386
  (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
5420
6387
  scales128 = __lsx_vsub_b(scales128, m32);
5421
- const __m256i all_scales = lasx_ext8_16(scales128);
5422
- const __m128i l_scales = lasx_extracti128(all_scales, 0);
5423
- const __m128i h_scales = lasx_extracti128(all_scales, 1);
5424
- const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
6388
+
6389
+ const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
6390
+ const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask));
5425
6391
 
5426
6392
  // high bit
5427
6393
  const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0);
@@ -5429,35 +6395,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5429
6395
  // integer accumulator
5430
6396
  __m256i sumi = __lasx_xvldi(0);
5431
6397
 
5432
- int bit = 0;
5433
- int is = 0;
5434
- __m256i xvbit;
5435
-
5436
-
5437
6398
  for (int j = 0; j < QK_K/128; ++j) {
5438
6399
  // load low 2 bits
5439
6400
  const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
5440
6401
 
5441
- xvbit = __lasx_xvreplgr2vr_h(bit);
5442
6402
  // prepare low and high bits
5443
- const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
5444
- const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
5445
- ++bit;
5446
-
5447
- xvbit = __lasx_xvreplgr2vr_h(bit);
5448
- const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
5449
- const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
5450
- ++bit;
5451
-
5452
- xvbit = __lasx_xvreplgr2vr_h(bit);
5453
- const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
5454
- const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
5455
- ++bit;
5456
-
5457
- xvbit = __lasx_xvreplgr2vr_h(bit);
5458
- const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
5459
- const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
5460
- ++bit;
6403
+ const __m256i q3l_0 = __lasx_xvandi_b(q3bits, 3);
6404
+ const __m256i q3l_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 2), 3);
6405
+ const __m256i q3l_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 4), 3);
6406
+ const __m256i q3l_3 = __lasx_xvsrli_b(q3bits, 6);
6407
+ const __m256i q3h_0 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 0), 0), 2);
6408
+ const __m256i q3h_1 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 1), 0), 2);
6409
+ const __m256i q3h_2 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 2), 0), 2);
6410
+ const __m256i q3h_3 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 3), 0), 2);
6411
+ const __m256i q3_0 = __lasx_xvor_v(q3h_0, q3l_0);
6412
+ const __m256i q3_1 = __lasx_xvor_v(q3h_1, q3l_1);
6413
+ const __m256i q3_2 = __lasx_xvor_v(q3h_2, q3l_2);
6414
+ const __m256i q3_3 = __lasx_xvor_v(q3h_3, q3l_3);
5461
6415
 
5462
6416
  // load Q8 quants
5463
6417
  const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
@@ -5465,29 +6419,16 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5465
6419
  const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
5466
6420
  const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
5467
6421
 
5468
- // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use lasx_maddubs_h,
5469
- // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
5470
- // and 2 if the high bit was set)
5471
- __m256i q8s_0 = lasx_maddubs_h(q3h_0, q8_0);
5472
- __m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1);
5473
- __m256i q8s_2 = lasx_maddubs_h(q3h_2, q8_2);
5474
- __m256i q8s_3 = lasx_maddubs_h(q3h_3, q8_3);
5475
-
5476
- __m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0);
5477
- __m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1);
5478
- __m256i p16_2 = lasx_maddubs_h(q3l_2, q8_2);
5479
- __m256i p16_3 = lasx_maddubs_h(q3l_3, q8_3);
5480
-
5481
- p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
5482
- p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
5483
- p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
5484
- p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
6422
+ __m256i p16_0 = lasx_madd_h_b(q8_0, q3_0);
6423
+ __m256i p16_1 = lasx_madd_h_b(q8_1, q3_1);
6424
+ __m256i p16_2 = lasx_madd_h_b(q8_2, q3_2);
6425
+ __m256i p16_3 = lasx_madd_h_b(q8_3, q3_3);
5485
6426
 
5486
6427
  // multiply with scales
5487
- p16_0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
5488
- p16_1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
5489
- p16_2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
5490
- p16_3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
6428
+ p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0);
6429
+ p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1);
6430
+ p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2);
6431
+ p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3);
5491
6432
 
5492
6433
  // accumulate
5493
6434
  p16_0 = __lasx_xvadd_w(p16_0, p16_1);
@@ -5495,7 +6436,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5495
6436
  sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2));
5496
6437
  }
5497
6438
  // multiply with block scale and accumulate
5498
- acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);//FIXME
6439
+ acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
5499
6440
  }
5500
6441
 
5501
6442
  *s = hsum_float_8(acc);
@@ -5520,11 +6461,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5520
6461
 
5521
6462
  float sumf = 0;
5522
6463
  for (int i = 0; i < nb; ++i) {
5523
- const uint8_t * restrict q3 = x[i].qs;
5524
- const uint8_t * restrict hm = x[i].hmask;
5525
- const int8_t * restrict q8 = y[i].qs;
6464
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6465
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
6466
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5526
6467
  memset(aux32, 0, 8*sizeof(int32_t));
5527
- int8_t * restrict a = aux8;
6468
+ int8_t * GGML_RESTRICT a = aux8;
5528
6469
  uint8_t m = 1;
5529
6470
  for (int j = 0; j < QK_K; j += 128) {
5530
6471
  for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
@@ -5567,7 +6508,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5567
6508
 
5568
6509
  }
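The rewritten LoongArch path earlier in this function now builds the signed 3-bit q3_K value with an OR instead of the old subtract: when the high bit is clear, OR-ing 0xFC onto the 2-bit value equals subtracting 4 in two's complement, so signed bytes can be fed straight into the signed multiply-add. A scalar sketch of the equivalence (hypothetical helpers):

    static inline int8_t q3_value_sub(uint8_t low2, int high_bit_set) {
        return (int8_t)low2 - (high_bit_set ? 0 : 4);
    }
    static inline int8_t q3_value_or(uint8_t low2, int high_bit_set) {
        // for low2 in 0..3, low2 | 0xFC is the two's-complement encoding of low2 - 4
        return (int8_t)(low2 | (high_bit_set ? 0x00 : 0xFC));
    }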
5569
6510
 
5570
- void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
6511
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
5571
6512
  assert(n % QK_K == 0);
5572
6513
  assert(nrc == 1);
5573
6514
  UNUSED(nrc);
@@ -5575,8 +6516,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5575
6516
  UNUSED(by);
5576
6517
  UNUSED(bs);
5577
6518
 
5578
- const block_q4_K * restrict x = vx;
5579
- const block_q8_K * restrict y = vy;
6519
+ const block_q4_K * GGML_RESTRICT x = vx;
6520
+ const block_q8_K * GGML_RESTRICT y = vy;
5580
6521
 
5581
6522
  const int nb = n / QK_K;
5582
6523
 
@@ -5611,8 +6552,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5611
6552
 
5612
6553
  const uint8_t * scales = (const uint8_t *)utmp;
5613
6554
 
5614
- const uint8_t * restrict q4 = x[i].qs;
5615
- const int8_t * restrict q8 = y[i].qs;
6555
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6556
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5616
6557
 
5617
6558
  const int vector_length = ggml_cpu_get_sve_cnt()*8;
5618
6559
  const svuint8_t m4b = svdup_n_u8(0xf);
@@ -5667,7 +6608,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5667
6608
  }
5668
6609
  }
5669
6610
  *s = sumf;
5670
- #elif __ARM_NEON
6611
+ #elif defined __ARM_NEON
5671
6612
  const uint8x16_t m4b = vdupq_n_u8(0xf);
5672
6613
  const int32x4_t mzero = vdupq_n_s32(0);
5673
6614
 
@@ -5699,8 +6640,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5699
6640
 
5700
6641
  const uint8_t * scales = (const uint8_t *)utmp;
5701
6642
 
5702
- const uint8_t * restrict q4 = x[i].qs;
5703
- const int8_t * restrict q8 = y[i].qs;
6643
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6644
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5704
6645
 
5705
6646
  int32_t sumi1 = 0;
5706
6647
  int32_t sumi2 = 0;
@@ -5712,20 +6653,121 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5712
6653
  q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
5713
6654
  q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
5714
6655
 
5715
- const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
5716
- sumi1 += vaddvq_s32(p1) * scales[2*j+0];
6656
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
6657
+ sumi1 += vaddvq_s32(p1) * scales[2*j+0];
6658
+
6659
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
6660
+ q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
6661
+ q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
6662
+
6663
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
6664
+
6665
+ sumi2 += vaddvq_s32(p2) * scales[2*j+1];
6666
+ }
6667
+
6668
+ sumf += d * (sumi1 + sumi2);
6669
+
6670
+ }
6671
+
6672
+ *s = sumf;
6673
+
6674
+ #elif defined __wasm_simd128__
6675
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
6676
+ float sumf = 0;
6677
+
6678
+ for (int i = 0; i < nb; ++i) {
6679
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6680
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
6681
+
6682
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6683
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6684
+
6685
+ // Process scales and mins
6686
+ memcpy(utmp, x[i].scales, 12);
6687
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
6688
+ const uint32_t uaux = utmp[1] & kmask1;
6689
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
6690
+ utmp[2] = uaux;
6691
+ utmp[0] &= kmask1;
6692
+
6693
+ // Sum mins * q8sums
6694
+ int32_t sumi = 0;
6695
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
6696
+ const uint8_t * m = (const uint8_t *)&utmp[2];
6697
+ for (int j = 0; j < 16; j += 2) {
6698
+ sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
6699
+ }
6700
+ sumf -= dmin * sumi;
6701
+
6702
+ int32_t sumi1 = 0;
6703
+ int32_t sumi2 = 0;
5717
6704
 
5718
- q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
5719
- q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
5720
- q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
6705
+ for (int j = 0; j < QK_K/64; ++j) {
6706
+ // Load 64 4-bit weights (32 bytes)
6707
+ const v128_t q4x0 = wasm_v128_load(q4);
6708
+ const v128_t q4x1 = wasm_v128_load(q4 + 16);
6709
+ q4 += 32;
5721
6710
 
5722
- const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
6711
+ // Split into low/high nibbles
6712
+ const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F));
6713
+ const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4);
6714
+ const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F));
6715
+ const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4);
6716
+
6717
+ // Load 64 8-bit values (64 bytes)
6718
+ const v128_t q8x0 = wasm_v128_load(q8);
6719
+ const v128_t q8x1 = wasm_v128_load(q8 + 16);
6720
+ const v128_t q8x2 = wasm_v128_load(q8 + 32);
6721
+ const v128_t q8x3 = wasm_v128_load(q8 + 48);
6722
+ q8 += 64;
5723
6723
 
5724
- sumi2 += vaddvq_s32(p2) * scales[2*j+1];
6724
+ // Low nibble products
6725
+ v128_t vacc1 = wasm_i32x4_dot_i16x8(
6726
+ wasm_i16x8_extend_low_i8x16(q4l0),
6727
+ wasm_i16x8_extend_low_i8x16(q8x0)
6728
+ );
6729
+ vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
6730
+ wasm_i16x8_extend_high_i8x16(q4l0),
6731
+ wasm_i16x8_extend_high_i8x16(q8x0)
6732
+ ));
6733
+ vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
6734
+ wasm_i16x8_extend_low_i8x16(q4l1),
6735
+ wasm_i16x8_extend_low_i8x16(q8x1)
6736
+ ));
6737
+ vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
6738
+ wasm_i16x8_extend_high_i8x16(q4l1),
6739
+ wasm_i16x8_extend_high_i8x16(q8x1)
6740
+ ));
6741
+
6742
+ // High nibble products
6743
+ v128_t vacc2 = wasm_i32x4_dot_i16x8(
6744
+ wasm_i16x8_extend_low_i8x16(q4h0),
6745
+ wasm_i16x8_extend_low_i8x16(q8x2)
6746
+ );
6747
+ vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
6748
+ wasm_i16x8_extend_high_i8x16(q4h0),
6749
+ wasm_i16x8_extend_high_i8x16(q8x2)
6750
+ ));
6751
+ vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
6752
+ wasm_i16x8_extend_low_i8x16(q4h1),
6753
+ wasm_i16x8_extend_low_i8x16(q8x3)
6754
+ ));
6755
+ vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
6756
+ wasm_i16x8_extend_high_i8x16(q4h1),
6757
+ wasm_i16x8_extend_high_i8x16(q8x3)
6758
+ ));
6759
+
6760
+ // Accumulate scaled results
6761
+ int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) +
6762
+ wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3);
6763
+ sumi1 += vacc1_sum * scales[2*j];
6764
+
6765
+ int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) +
6766
+ wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3);
6767
+ sumi2 += vacc2_sum * scales[2*j+1];
5725
6768
  }
5726
6769
 
5727
6770
  sumf += d * (sumi1 + sumi2);
5728
-
5729
6771
  }
5730
6772
 
5731
6773
  *s = sumf;
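The "Sum mins * q8sums" step in the new __wasm_simd128__ block above relies on each sub-block minimum being a constant offset, so it can be folded in once through the precomputed per-16 sums in y[i].bsums rather than subtracted element by element. A minimal scalar sketch of that correction; q4k_mins_correction is a hypothetical helper, and it assumes bsums[k] is the sum of 16 consecutive q8 values, so two entries cover one 32-value sub-block:

    // Hypothetical scalar form of the mins correction; the caller then does
    //     sumf -= dmin * q4k_mins_correction(y[i].bsums, mins);
    static int32_t q4k_mins_correction(const int16_t bsums[16], const uint8_t mins[8]) {
        int32_t sumi = 0;
        for (int j = 0; j < 16; j += 2) {
            sumi += (int32_t)(bsums[j] + bsums[j + 1]) * mins[j / 2];
        }
        return sumi;
    }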
@@ -5749,8 +6791,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5749
6791
  utmp[2] = uaux;
5750
6792
  utmp[0] &= kmask1;
5751
6793
 
5752
- const uint8_t * restrict q4 = x[i].qs;
5753
- const int8_t * restrict q8 = y[i].qs;
6794
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6795
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5754
6796
 
5755
6797
  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
5756
6798
 
@@ -5808,8 +6850,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5808
6850
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5809
6851
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
5810
6852
 
5811
- const uint8_t * restrict q4 = x[i].qs;
5812
- const int8_t * restrict q8 = y[i].qs;
6853
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6854
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5813
6855
 
5814
6856
  memcpy(utmp, x[i].scales, 12);
5815
6857
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -5909,8 +6951,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5909
6951
  vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
5910
6952
  sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
5911
6953
 
5912
- const uint8_t * restrict q4 = x[i].qs;
5913
- const int8_t * restrict q8 = y[i].qs;
6954
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6955
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5914
6956
 
5915
6957
  vl = 32;
5916
6958
 
@@ -6011,8 +7053,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6011
7053
  vector signed int vsumi2 = v0;
6012
7054
  vector signed int vsumi3 = v0;
6013
7055
 
6014
- const uint8_t * restrict q4 = x[i].qs;
6015
- const int8_t * restrict q8 = y[i].qs;
7056
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7057
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6016
7058
 
6017
7059
  for (int j = 0; j < QK_K/64; j+=2) {
6018
7060
  __builtin_prefetch(q4, 0, 1);
@@ -6087,11 +7129,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6087
7129
  *s = vec_extract(vsumf0, 0);
6088
7130
 
6089
7131
  #elif defined __loongarch_asx
6090
- GGML_UNUSED(kmask1);
6091
- GGML_UNUSED(kmask2);
6092
- GGML_UNUSED(kmask3);
6093
-
6094
- const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
6095
7132
 
6096
7133
  __m256 acc = (__m256)__lasx_xvldi(0);
6097
7134
  __m128 acc_m = (__m128)__lsx_vldi(0);
@@ -6108,36 +7145,37 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6108
7145
  utmp[2] = uaux;
6109
7146
  utmp[0] &= kmask1;
6110
7147
 
6111
- const uint8_t * restrict q4 = x[i].qs;
6112
- const int8_t * restrict q8 = y[i].qs;
7148
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7149
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6113
7150
 
6114
- const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
7151
+ const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
7152
+ const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
7153
+ const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0);
6115
7154
 
6116
7155
  const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
6117
7156
  const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
6118
- const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
7157
+ const __m128i prod = lsx_madd_h(mins128, q8s);
6119
7158
  acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
6120
7159
 
6121
- const __m128i sc128 = lasx_extracti128(mins_and_scales, 0);
6122
- const __m256i scales = lasx_insertf128(sc128, sc128);
7160
+ const __m256i scales = lasx_insertf128(scales128, scales128);
6123
7161
 
6124
7162
  __m256i sumi = __lasx_xvldi(0);
6125
7163
 
6126
7164
  for (int j = 0; j < QK_K/64; ++j) {
6127
7165
 
6128
- const __m256i scale_l = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
6129
- const __m256i scale_h = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
7166
+ const __m256i scale_l = lasx_xvrepl128vei_h(scales, 2 * j + 0);
7167
+ const __m256i scale_h = lasx_xvrepl128vei_h(scales, 2 * j + 1);
6130
7168
 
6131
7169
  const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
6132
- const __m256i q4l = __lasx_xvand_v(q4bits, m4);
6133
- const __m256i q4h = __lasx_xvand_v(__lasx_xvsrli_h(q4bits, 4), m4);
7170
+ const __m256i q4l = __lasx_xvandi_b(q4bits, 0xf);
7171
+ const __m256i q4h = __lasx_xvsrli_b(q4bits, 4);
6134
7172
 
6135
7173
  const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
6136
- __m256i p16l = lasx_maddubs_h(q4l, q8l);
7174
+ __m256i p16l = lasx_madd_h_b(q4l, q8l);
6137
7175
  p16l = lasx_madd_h(scale_l, p16l);
6138
7176
 
6139
7177
  const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
6140
- __m256i p16h = lasx_maddubs_h(q4h, q8h);
7178
+ __m256i p16h = lasx_madd_h_b(q4h, q8h);
6141
7179
  p16h = lasx_madd_h(scale_h, p16h);
6142
7180
  const __m256i sumj = __lasx_xvadd_w(p16l, p16h);
6143
7181
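The hunk above swaps lasx_maddubs_h for lasx_madd_h_b in the Q4_K LASX path. Both helpers stand for the same pairwise pattern: multiply adjacent bytes and accumulate each pair into a 16-bit lane, the difference being the signedness of the first operand. A minimal scalar sketch of that pattern, assuming signed-by-signed inputs as the new helper appears to intend; madd_pairs_i8 is a hypothetical illustration, not part of the package:

    // Hypothetical scalar sketch of a maddubs-style pairwise multiply-add:
    // each output lane is the sum of two adjacent byte products, widened to 16 bits.
    static void madd_pairs_i8(const int8_t * a, const int8_t * b, int16_t out[16]) {
        for (int k = 0; k < 16; ++k) {
            out[k] = (int16_t)(a[2*k] * b[2*k] + a[2*k + 1] * b[2*k + 1]);
        }
    }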
 
@@ -6154,9 +7192,78 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6154
7192
  acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
6155
7193
 
6156
7194
 
6157
- ft_union fi;
6158
- fi.i = __lsx_vpickve2gr_w(acc_m, 0);
6159
- *s = hsum_float_8(acc) + fi.f ;
7195
+ *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
7196
+ #elif defined(__VXE__) || defined(__VXE2__)
7197
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
7198
+ const int32x4_t v_z = vec_splat_s32(0);
7199
+
7200
+ uint8x16_t v_x[2];
7201
+ int8x16_t v_xl[2];
7202
+ int8x16_t v_y[2];
7203
+
7204
+ float sumf = 0;
7205
+
7206
+ for (int i = 0; i < nb; ++i) {
7207
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7208
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
7209
+
7210
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
7211
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
7212
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
7213
+
7214
+ memcpy(utmp, x[i].scales, 12);
7215
+
7216
+ uint32x4_t v_mins8 = { 0 };
7217
+ v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0);
7218
+ v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1);
7219
+
7220
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
7221
+ utmp[0] &= kmask1;
7222
+
7223
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
7224
+
7225
+ const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
7226
+ const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
7227
+ const int32x4_t v_mins = v_minso + v_minse;
7228
+ sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
7229
+
7230
+ const uint8_t * scales = (const uint8_t *)utmp;
7231
+ const uint8_t * GGML_RESTRICT x0 = x[i].qs;
7232
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
7233
+
7234
+ int32_t sumi1 = 0;
7235
+ int32_t sumi2 = 0;
7236
+
7237
+ for (int j = 0; j < QK_K/64; ++j) {
7238
+ v_x[0] = vec_xl(0 , x0);
7239
+ v_x[1] = vec_xl(16, x0);
7240
+ x0 += 32;
7241
+
7242
+ v_y[0] = vec_xl(0 , y0);
7243
+ v_y[1] = vec_xl(16, y0);
7244
+ y0 += 32;
7245
+
7246
+ v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm);
7247
+ v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
7248
+
7249
+ const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
7250
+ sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
7251
+
7252
+ v_y[0] = vec_xl(0 , y0);
7253
+ v_y[1] = vec_xl(16, y0);
7254
+ y0 += 32;
7255
+
7256
+ v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4);
7257
+ v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
7258
+
7259
+ const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
7260
+ sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
7261
+ }
7262
+
7263
+ sumf += d * (sumi1 + sumi2);
7264
+ }
7265
+
7266
+ *s = sumf;
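In the new VXE branch above, the mins dot product is built with vec_mulo/vec_mule, which multiply the odd- and even-indexed 16-bit lanes respectively while widening to 32 bits; adding the two partial vectors recovers the full sum. A minimal scalar sketch of that split, with dot_i16x8_odd_even as a hypothetical helper:

    // Hypothetical scalar equivalent of the vec_mule + vec_mulo + add sequence:
    // an 8-lane widening i16 dot product split into even and odd halves.
    static int32_t dot_i16x8_odd_even(const int16_t a[8], const int16_t b[8]) {
        int32_t even = 0, odd = 0;
        for (int k = 0; k < 8; k += 2) even += (int32_t)a[k] * b[k];
        for (int k = 1; k < 8; k += 2) odd  += (int32_t)a[k] * b[k];
        return even + odd;
    }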
6160
7267
  #else
6161
7268
 
6162
7269
  const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -6170,10 +7277,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6170
7277
 
6171
7278
  float sumf = 0;
6172
7279
  for (int i = 0; i < nb; ++i) {
6173
- const uint8_t * restrict q4 = x[i].qs;
6174
- const int8_t * restrict q8 = y[i].qs;
7280
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7281
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6175
7282
  memset(aux32, 0, 8*sizeof(int32_t));
6176
- int8_t * restrict a = aux8;
7283
+ int8_t * GGML_RESTRICT a = aux8;
6177
7284
  for (int j = 0; j < QK_K/64; ++j) {
6178
7285
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
6179
7286
  a += 32;
@@ -6216,7 +7323,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6216
7323
  #endif
6217
7324
  }
6218
7325
 
6219
- void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
7326
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
6220
7327
  assert(n % QK_K == 0);
6221
7328
  assert(nrc == 1);
6222
7329
  UNUSED(nrc);
@@ -6224,8 +7331,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6224
7331
  UNUSED(by);
6225
7332
  UNUSED(bs);
6226
7333
 
6227
- const block_q5_K * restrict x = vx;
6228
- const block_q8_K * restrict y = vy;
7334
+ const block_q5_K * GGML_RESTRICT x = vx;
7335
+ const block_q8_K * GGML_RESTRICT y = vy;
6229
7336
 
6230
7337
  const int nb = n / QK_K;
6231
7338
 
@@ -6267,9 +7374,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6267
7374
 
6268
7375
  const uint8_t * scales = (const uint8_t *)utmp;
6269
7376
 
6270
- const uint8_t * restrict q5 = x[i].qs;
6271
- const uint8_t * restrict qh = x[i].qh;
6272
- const int8_t * restrict q8 = y[i].qs;
7377
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7378
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7379
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6273
7380
 
6274
7381
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
6275
7382
 
@@ -6314,8 +7421,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6314
7421
  float summs = 0.f;
6315
7422
 
6316
7423
  for (int i = 0; i < nb; ++i) {
6317
- const uint8_t * restrict q5 = x[i].qs;
6318
- const int8_t * restrict q8 = y[i].qs;
7424
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7425
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6319
7426
 
6320
7427
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6321
7428
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -6398,8 +7505,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6398
7505
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6399
7506
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
6400
7507
 
6401
- const uint8_t * restrict q5 = x[i].qs;
6402
- const int8_t * restrict q8 = y[i].qs;
7508
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7509
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6403
7510
 
6404
7511
  memcpy(utmp, x[i].scales, 12);
6405
7512
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -6482,6 +7589,118 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6482
7589
 
6483
7590
  *s = hsum_float_8(acc) + summs;
6484
7591
 
7592
+ #elif defined __wasm_simd128__
7593
+ //const uint8_t * scales = (const uint8_t*)&utmp[0];
7594
+ float sumf = 0;
7595
+
7596
+ for (int i = 0; i < nb; ++i) {
7597
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7598
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
7599
+
7600
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7601
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7602
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7603
+
7604
+ // Process scales and mins
7605
+ memcpy(utmp, x[i].scales, 12);
7606
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
7607
+ const uint32_t uaux = utmp[1] & kmask1;
7608
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
7609
+ utmp[2] = uaux;
7610
+ utmp[0] &= kmask1;
7611
+
7612
+ // Sum mins * q8sums
7613
+ int32_t sumi_mins = 0;
7614
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
7615
+ const uint8_t * m = (const uint8_t *)&utmp[2];
7616
+ for (int j = 0; j < 16; j += 2) {
7617
+ sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
7618
+ }
7619
+ sumf -= dmin * sumi_mins; // Correct subtraction
7620
+
7621
+ v128_t qh0 = wasm_v128_load(qh);
7622
+ v128_t qh1 = wasm_v128_load(qh + 16);
7623
+ const uint8_t * sc = (const uint8_t *)utmp;
7624
+
7625
+ int32_t sumi = 0;
7626
+
7627
+ for (int j = 0; j < QK_K/64; ++j) {
7628
+ const int shift = j * 2;
7629
+ v128_t qh_shift0 = wasm_u8x16_shr(qh0, shift);
7630
+ v128_t qh_shift1 = wasm_u8x16_shr(qh1, shift);
7631
+
7632
+ v128_t qh_low0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x01)), 4);
7633
+ v128_t qh_high0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x02)), 3);
7634
+ v128_t qh_low1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x01)), 4);
7635
+ v128_t qh_high1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x02)), 3);
7636
+
7637
+ v128_t q5_0 = wasm_v128_load(q5);
7638
+ v128_t q5_1 = wasm_v128_load(q5 + 16);
7639
+ q5 += 32;
7640
+
7641
+ v128_t q5l_0 = wasm_v128_or(wasm_v128_and(q5_0, wasm_i8x16_splat(0x0F)), qh_low0);
7642
+ v128_t q5h_0 = wasm_v128_or(wasm_u8x16_shr(q5_0, 4), qh_high0);
7643
+ v128_t q5l_1 = wasm_v128_or(wasm_v128_and(q5_1, wasm_i8x16_splat(0x0F)), qh_low1);
7644
+ v128_t q5h_1 = wasm_v128_or(wasm_u8x16_shr(q5_1, 4), qh_high1);
7645
+
7646
+ v128_t q8_0 = wasm_v128_load(q8);
7647
+ v128_t q8_1 = wasm_v128_load(q8 + 16);
7648
+ v128_t q8_2 = wasm_v128_load(q8 + 32);
7649
+ v128_t q8_3 = wasm_v128_load(q8 + 48);
7650
+ q8 += 64;
7651
+
7652
+ // Process low quants
7653
+ v128_t pl0 = wasm_i32x4_dot_i16x8(
7654
+ wasm_i16x8_extend_low_i8x16(q5l_0),
7655
+ wasm_i16x8_extend_low_i8x16(q8_0)
7656
+ );
7657
+ pl0 = wasm_i32x4_add(pl0, wasm_i32x4_dot_i16x8(
7658
+ wasm_i16x8_extend_high_i8x16(q5l_0),
7659
+ wasm_i16x8_extend_high_i8x16(q8_0)
7660
+ ));
7661
+ v128_t pl1 = wasm_i32x4_dot_i16x8(
7662
+ wasm_i16x8_extend_low_i8x16(q5l_1),
7663
+ wasm_i16x8_extend_low_i8x16(q8_1)
7664
+ );
7665
+ pl1 = wasm_i32x4_add(pl1, wasm_i32x4_dot_i16x8(
7666
+ wasm_i16x8_extend_high_i8x16(q5l_1),
7667
+ wasm_i16x8_extend_high_i8x16(q8_1)
7668
+ ));
7669
+ v128_t sum_low = wasm_i32x4_add(pl0, pl1);
7670
+
7671
+ // Process high quants
7672
+ v128_t ph0 = wasm_i32x4_dot_i16x8(
7673
+ wasm_i16x8_extend_low_i8x16(q5h_0),
7674
+ wasm_i16x8_extend_low_i8x16(q8_2)
7675
+ );
7676
+ ph0 = wasm_i32x4_add(ph0, wasm_i32x4_dot_i16x8(
7677
+ wasm_i16x8_extend_high_i8x16(q5h_0),
7678
+ wasm_i16x8_extend_high_i8x16(q8_2)
7679
+ ));
7680
+ v128_t ph1 = wasm_i32x4_dot_i16x8(
7681
+ wasm_i16x8_extend_low_i8x16(q5h_1),
7682
+ wasm_i16x8_extend_low_i8x16(q8_3)
7683
+ );
7684
+ ph1 = wasm_i32x4_add(ph1, wasm_i32x4_dot_i16x8(
7685
+ wasm_i16x8_extend_high_i8x16(q5h_1),
7686
+ wasm_i16x8_extend_high_i8x16(q8_3)
7687
+ ));
7688
+ v128_t sum_high = wasm_i32x4_add(ph0, ph1);
7689
+
7690
+ // Accumulate with scale factors
7691
+ int32_t sl = wasm_i32x4_extract_lane(sum_low, 0) + wasm_i32x4_extract_lane(sum_low, 1) +
7692
+ wasm_i32x4_extract_lane(sum_low, 2) + wasm_i32x4_extract_lane(sum_low, 3);
7693
+ int32_t sh = wasm_i32x4_extract_lane(sum_high, 0) + wasm_i32x4_extract_lane(sum_high, 1) +
7694
+ wasm_i32x4_extract_lane(sum_high, 2) + wasm_i32x4_extract_lane(sum_high, 3);
7695
+
7696
+ sumi += sl * sc[2*j] + sh * sc[2*j+1];
7697
+ }
7698
+
7699
+ sumf += d * sumi;
7700
+ }
7701
+
7702
+ *s = sumf;
7703
+
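The qh handling in the __wasm_simd128__ block above reconstructs the fifth bit of each Q5_K weight: two bits per qh byte serve the low-nibble and high-nibble halves of the same byte of q5. A minimal scalar sketch of that reconstruction for one byte position, mirroring the shifts used above; q5_pair is a hypothetical helper:

    // Hypothetical scalar reconstruction of the two Q5_K values packed in one q5 byte.
    // 'shift' is 2*j as in the loop above; bit 0 of (qh >> shift) extends the low
    // nibble, bit 1 extends the high nibble.
    static void q5_pair(uint8_t q5, uint8_t qh, int shift, uint8_t out[2]) {
        const uint8_t h = (uint8_t)(qh >> shift);
        out[0] = (uint8_t)((q5 & 0x0F) | ((h & 0x01) << 4)); // low-nibble value, 0..31
        out[1] = (uint8_t)((q5 >> 4)   | ((h & 0x02) << 3)); // high-nibble value, 0..31
    }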
6485
7704
  #elif defined __riscv_v_intrinsic
6486
7705
 
6487
7706
  const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -6496,9 +7715,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6496
7715
 
6497
7716
  vl = 8;
6498
7717
 
6499
- const uint8_t * restrict q5 = x[i].qs;
6500
- const uint8_t * restrict hm = x[i].qh;
6501
- const int8_t * restrict q8 = y[i].qs;
7718
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7719
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
7720
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6502
7721
 
6503
7722
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
6504
7723
  const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
@@ -6637,8 +7856,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6637
7856
  vector signed int vsumi2 = v0;
6638
7857
  vector signed int vsumi3 = v0;
6639
7858
 
6640
- const uint8_t * restrict q5 = x[i].qs;
6641
- const int8_t * restrict q8 = y[i].qs;
7859
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7860
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6642
7861
 
6643
7862
  for (int j = 0; j < QK_K/64; ++j) {
6644
7863
  __builtin_prefetch(q5, 0, 1);
@@ -6704,22 +7923,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6704
7923
  *s = vec_extract(vsumf0, 0);
6705
7924
 
6706
7925
  #elif defined __loongarch_asx
6707
- GGML_UNUSED(kmask1);
6708
- GGML_UNUSED(kmask2);
6709
- GGML_UNUSED(kmask3);
6710
-
6711
- const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
6712
- const __m128i mzero = __lsx_vldi(0);
6713
- const __m256i mone = __lasx_xvreplgr2vr_b(1);
6714
7926
 
6715
7927
  __m256 acc = (__m256)__lasx_xvldi(0);
7928
+ __m128 acc_m = (__m128)__lsx_vldi(0);
6716
7929
 
6717
- float summs = 0.f;
6718
-
6719
- for (int i = 0; i < nb; ++i) {
7930
+ for (int i = 0; i < nb; ++i) {
6720
7931
 
6721
- const uint8_t * restrict q5 = x[i].qs;
6722
- const int8_t * restrict q8 = y[i].qs;
7932
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7933
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6723
7934
 
6724
7935
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6725
7936
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -6731,49 +7942,40 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6731
7942
  utmp[2] = uaux;
6732
7943
  utmp[0] &= kmask1;
6733
7944
 
6734
- const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
7945
+ const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
7946
+ const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
7947
+ const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0);
6735
7948
 
6736
7949
  const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
6737
7950
  const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
6738
- const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
6739
- const __m128i hsum = lsx_hadd_w(lsx_hadd_w(prod, mzero), mzero);
6740
- summs += dmin * __lsx_vpickve2gr_w(hsum, 0); //TODO check
7951
+ const __m128i prod = lsx_madd_h(mins128, q8s);
7952
+ acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
6741
7953
 
6742
- const __m128i sc128 = lasx_extracti128(mins_and_scales, 0);
6743
- const __m256i scales = lasx_insertf128(sc128, sc128);
7954
+ const __m256i scales = lasx_insertf128(scales128, scales128);
6744
7955
 
6745
7956
  const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0);
6746
- __m256i hmask = mone;
6747
7957
 
6748
7958
  __m256i sumi = __lasx_xvldi(0);
6749
7959
 
6750
- int bit = 0;
6751
- __m256i xvbit;
6752
-
6753
7960
  for (int j = 0; j < QK_K/64; ++j) {
6754
7961
 
6755
- const __m256i scale_0 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
6756
- const __m256i scale_1 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
7962
+ const __m256i scale_0 = lasx_xvrepl128vei_h(scales, 2 * j + 0);
7963
+ const __m256i scale_1 = lasx_xvrepl128vei_h(scales, 2 * j + 1);
6757
7964
 
6758
7965
  const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
6759
7966
 
6760
- xvbit = __lasx_xvreplgr2vr_h(bit++);
6761
- const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
6762
- const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
6763
- const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0);
6764
- hmask = __lasx_xvslli_h(hmask, 1);
6765
-
6766
- xvbit = __lasx_xvreplgr2vr_h(bit++);
6767
- const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
6768
- const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
6769
- const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
6770
- hmask = __lasx_xvslli_h(hmask, 1);
7967
+ const __m256i q5l_0 = __lasx_xvandi_b(q5bits, 0xf);
7968
+ const __m256i q5l_1 = __lasx_xvsrli_b(q5bits, 4);
7969
+ const __m256i q5h_0 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 0), 0), 0xef);
7970
+ const __m256i q5h_1 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 1), 0), 0xef);
7971
+ const __m256i q5_0 = __lasx_xvor_v(q5l_0, q5h_0);
7972
+ const __m256i q5_1 = __lasx_xvor_v(q5l_1, q5h_1);
6771
7973
 
6772
7974
  const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
6773
7975
  const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
6774
7976
 
6775
- __m256i p16_0 = lasx_maddubs_h(q5_0, q8_0);
6776
- __m256i p16_1 = lasx_maddubs_h(q5_1, q8_1);
7977
+ __m256i p16_0 = lasx_madd_h_b(q5_0, q8_0);
7978
+ __m256i p16_1 = lasx_madd_h_b(q5_1, q8_1);
6777
7979
 
6778
7980
  p16_0 = lasx_madd_h(scale_0, p16_0);
6779
7981
  p16_1 = lasx_madd_h(scale_1, p16_1);
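The rewrite above drops the running hmask/xvbit shifting in favour of a per-byte compare trick: isolate bit (2*j+k) of hbits, compare against zero, then NOR with 0xEF so a set high bit becomes 0x10, ready to OR into the low nibble. A byte-level sketch of that step, assuming xvseqi_b yields 0xFF for equal bytes and xvnori_b computes ~(a | imm); q5_high_to_0x10 is a hypothetical helper:

    // Hypothetical byte-level form of the xvseqi/xvnori step: map "high bit set"
    // to 0x10 and "high bit clear" to 0x00.
    static uint8_t q5_high_to_0x10(uint8_t isolated_bit) {
        const uint8_t eq0 = (uint8_t)(isolated_bit == 0 ? 0xFF : 0x00); // xvseqi_b(x, 0)
        return (uint8_t)~(eq0 | 0xEF);                                  // xvnori_b(eq0, 0xEF)
    }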
@@ -6787,8 +7989,98 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6787
7989
 
6788
7990
  }
6789
7991
 
6790
- *s = hsum_float_8(acc) + summs;
7992
+ acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 8));
7993
+ acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4));
7994
+
7995
+ *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
7996
+ #elif defined(__VXE__) || defined(__VXE2__)
7997
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
7998
+ const uint8x16_t v_1m = vec_splat_u8(0x01);
7999
+ const uint8x16_t v_2m = vec_splat_u8(0x02);
8000
+
8001
+ const int32x4_t v_z = vec_splat_s32(0);
8002
+
8003
+ const uchar8x16_t v_minsm = {
8004
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
8005
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
8006
+ };
8007
+
8008
+ int8x16_t q5b[4];
8009
+ uint8x16_t q5h[4];
8010
+
8011
+ uint8x16_t v_xl[2];
8012
+ uint8x16_t v_xh[2];
8013
+ int8x16_t v_y[4];
8014
+
8015
+ float sumf = 0;
8016
+
8017
+ for (int i = 0; i < nb; ++i) {
8018
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8019
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
8020
+
8021
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
8022
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
8023
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
8024
+
8025
+ memcpy(utmp, x[i].scales, 12);
8026
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
8027
+ const uint32_t uaux = utmp[1] & kmask1;
8028
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
8029
+ utmp[2] = uaux;
8030
+ utmp[0] &= kmask1;
8031
+
8032
+ const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
8033
+ const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
8034
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
8035
+
8036
+ const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
8037
+ const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
8038
+ const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
8039
+ const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
8040
+
8041
+ const uint8_t * scales = (const uint8_t *)utmp;
8042
+ const uint8_t * GGML_RESTRICT x0l = x[i].qs;
8043
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
8044
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
8045
+
8046
+ v_xh[0] = vec_xl(0 , x0h);
8047
+ v_xh[1] = vec_xl(16, x0h);
8048
+
8049
+ int32_t sumi = 0;
8050
+ for (int j = 0; j < QK_K/64; ++j) {
8051
+ v_xl[0] = vec_xl(0 , x0l);
8052
+ v_xl[1] = vec_xl(16, x0l);
8053
+ x0l += 32;
8054
+
8055
+ v_y[0] = vec_xl(0 , y0);
8056
+ v_y[1] = vec_xl(16, y0);
8057
+ v_y[2] = vec_xl(32, y0);
8058
+ v_y[3] = vec_xl(48, y0);
8059
+ y0 += 64;
8060
+
8061
+ q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
8062
+ q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
8063
+ q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
8064
+ q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
8065
+ v_xh[0] = vec_sr(v_xh[0], 2);
8066
+ v_xh[1] = vec_sr(v_xh[1], 2);
8067
+
8068
+ q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
8069
+ q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
8070
+ q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
8071
+ q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);
8072
+
8073
+ int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
8074
+ int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
8075
+
8076
+ sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
8077
+ sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
8078
+ }
6791
8079
 
8080
+ sumf += d * sumi - dmin * mins;
8081
+ }
8082
+
8083
+ *s = sumf;
6792
8084
  #else
6793
8085
 
6794
8086
  const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -6802,11 +8094,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6802
8094
 
6803
8095
  float sumf = 0;
6804
8096
  for (int i = 0; i < nb; ++i) {
6805
- const uint8_t * restrict q4 = x[i].qs;
6806
- const uint8_t * restrict hm = x[i].qh;
6807
- const int8_t * restrict q8 = y[i].qs;
8097
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
8098
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
8099
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6808
8100
  memset(aux32, 0, 8*sizeof(int32_t));
6809
- int8_t * restrict a = aux8;
8101
+ int8_t * GGML_RESTRICT a = aux8;
6810
8102
  uint8_t m = 1;
6811
8103
  for (int j = 0; j < QK_K/64; ++j) {
6812
8104
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
@@ -6853,7 +8145,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6853
8145
  #endif
6854
8146
  }
6855
8147
 
6856
- void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
8148
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
6857
8149
  assert(n % QK_K == 0);
6858
8150
  assert(nrc == 1);
6859
8151
  UNUSED(nrc);
@@ -6861,8 +8153,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6861
8153
  UNUSED(by);
6862
8154
  UNUSED(bs);
6863
8155
 
6864
- const block_q6_K * restrict x = vx;
6865
- const block_q8_K * restrict y = vy;
8156
+ const block_q6_K * GGML_RESTRICT x = vx;
8157
+ const block_q8_K * GGML_RESTRICT y = vy;
6866
8158
 
6867
8159
  const int nb = n / QK_K;
6868
8160
 
@@ -6882,11 +8174,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6882
8174
 
6883
8175
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
6884
8176
 
6885
- const uint8_t * restrict q6 = x[i].ql;
6886
- const uint8_t * restrict qh = x[i].qh;
6887
- const int8_t * restrict q8 = y[i].qs;
8177
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8178
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8179
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6888
8180
 
6889
- const int8_t * restrict scale = x[i].scales;
8181
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
6890
8182
 
6891
8183
  const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
6892
8184
  const int8x16_t scales = vld1q_s8(scale);
@@ -6973,9 +8265,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6973
8265
 
6974
8266
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6975
8267
 
6976
- const uint8_t * restrict q4 = x[i].ql;
6977
- const uint8_t * restrict qh = x[i].qh;
6978
- const int8_t * restrict q8 = y[i].qs;
8268
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8269
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8270
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6979
8271
 
6980
8272
  const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
6981
8273
 
@@ -7051,9 +8343,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7051
8343
 
7052
8344
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7053
8345
 
7054
- const uint8_t * restrict q4 = x[i].ql;
7055
- const uint8_t * restrict qh = x[i].qh;
7056
- const int8_t * restrict q8 = y[i].qs;
8346
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8347
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8348
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7057
8349
 
7058
8350
  // handle the q6_k -32 offset separately using bsums
7059
8351
  const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
@@ -7145,6 +8437,85 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7145
8437
 
7146
8438
  *s = hsum_float_8(acc);
7147
8439
 
8440
+ #elif defined __wasm_simd128__
8441
+ int8_t aux8[QK_K] __attribute__((aligned(16)));
8442
+ int32_t aux32[8] __attribute__((aligned(16))) = {0};
8443
+ float sums[8] __attribute__((aligned(16))) = {0};
8444
+
8445
+ for (int i = 0; i < nb; ++i) {
8446
+ // Unpack 6-bit quantized data into aux8 (unchanged)
8447
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8448
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8449
+ int8_t * a = aux8;
8450
+ for (int j = 0; j < QK_K; j += 128) {
8451
+ for (int l = 0; l < 32; ++l) {
8452
+ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
8453
+ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
8454
+ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
8455
+ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
8456
+ }
8457
+ a += 128;
8458
+ q4 += 64;
8459
+ qh += 32;
8460
+ }
8461
+
8462
+ const int8_t * GGML_RESTRICT a_ptr = aux8;
8463
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8464
+ v128_t acc0 = wasm_i32x4_splat(0);
8465
+ v128_t acc1 = wasm_i32x4_splat(0);
8466
+
8467
+ for (int j = 0; j < QK_K/16; ++j) {
8468
+ const int scale = x[i].scales[j];
8469
+ const v128_t vscale = wasm_i32x4_splat(scale);
8470
+
8471
+ // Load 16 elements from a and q8
8472
+ const v128_t a_vec = wasm_v128_load(a_ptr);
8473
+ const v128_t q8_vec = wasm_v128_load(q8);
8474
+
8475
+ // Process low 8 elements
8476
+ v128_t a_low = wasm_i16x8_extend_low_i8x16(a_vec);
8477
+ v128_t q8_low = wasm_i16x8_extend_low_i8x16(q8_vec);
8478
+ v128_t prod_low = wasm_i16x8_mul(a_low, q8_low);
8479
+ v128_t prod_lo_lo = wasm_i32x4_extend_low_i16x8(prod_low);
8480
+ v128_t prod_lo_hi = wasm_i32x4_extend_high_i16x8(prod_low);
8481
+
8482
+ // Process high 8 elements
8483
+ v128_t a_high = wasm_i16x8_extend_high_i8x16(a_vec);
8484
+ v128_t q8_high = wasm_i16x8_extend_high_i8x16(q8_vec);
8485
+ v128_t prod_high = wasm_i16x8_mul(a_high, q8_high);
8486
+ v128_t prod_hi_lo = wasm_i32x4_extend_low_i16x8(prod_high);
8487
+ v128_t prod_hi_hi = wasm_i32x4_extend_high_i16x8(prod_high);
8488
+
8489
+ // Scale and accumulate
8490
+ prod_lo_lo = wasm_i32x4_mul(prod_lo_lo, vscale);
8491
+ prod_lo_hi = wasm_i32x4_mul(prod_lo_hi, vscale);
8492
+ prod_hi_lo = wasm_i32x4_mul(prod_hi_lo, vscale);
8493
+ prod_hi_hi = wasm_i32x4_mul(prod_hi_hi, vscale);
8494
+
8495
+ acc0 = wasm_i32x4_add(acc0, wasm_i32x4_add(prod_lo_lo, prod_hi_lo));
8496
+ acc1 = wasm_i32x4_add(acc1, wasm_i32x4_add(prod_lo_hi, prod_hi_hi));
8497
+
8498
+ a_ptr += 16;
8499
+ q8 += 16;
8500
+ }
8501
+
8502
+ // Store accumulated results
8503
+ wasm_v128_store(&aux32[0], acc0);
8504
+ wasm_v128_store(&aux32[4], acc1);
8505
+
8506
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8507
+ for (int l = 0; l < 8; ++l) {
8508
+ sums[l] += d * aux32[l];
8509
+ }
8510
+ }
8511
+
8512
+ // Sum final results
8513
+ float sumf = 0;
8514
+ for (int l = 0; l < 8; ++l) {
8515
+ sumf += sums[l];
8516
+ }
8517
+ *s = sumf;
8518
+
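After the scalar unpack into aux8, the __wasm_simd128__ block above processes 16 values at a time: widen to 16 bits, multiply against q8, apply the per-16 scale, and accumulate into eight running 32-bit lanes. A minimal scalar sketch of one such step; q6_step is a hypothetical helper:

    // Hypothetical scalar form of one 16-value step of the Q6_K loop above:
    // widen, multiply, scale, and accumulate into eight running lanes.
    static void q6_step(const int8_t a[16], const int8_t q8[16], int scale, int32_t acc[8]) {
        for (int k = 0; k < 16; ++k) {
            acc[k % 8] += scale * (int32_t)a[k] * (int32_t)q8[k];
        }
    }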
7148
8519
  #elif defined __riscv_v_intrinsic
7149
8520
 
7150
8521
  float sumf = 0;
@@ -7152,11 +8523,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7152
8523
 
7153
8524
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7154
8525
 
7155
- const uint8_t * restrict q6 = x[i].ql;
7156
- const uint8_t * restrict qh = x[i].qh;
7157
- const int8_t * restrict q8 = y[i].qs;
8526
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8527
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8528
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7158
8529
 
7159
- const int8_t * restrict scale = x[i].scales;
8530
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
7160
8531
 
7161
8532
  size_t vl;
7162
8533
 
@@ -7258,10 +8629,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7258
8629
  vector signed int vsumi6 = v0;
7259
8630
  vector signed int vsumi7 = v0;
7260
8631
 
7261
- const uint8_t * restrict q6 = x[i].ql;
7262
- const uint8_t * restrict qh = x[i].qh;
7263
- const int8_t * restrict qs = x[i].scales;
7264
- const int8_t * restrict q8 = y[i].qs;
8632
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8633
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8634
+ const int8_t * GGML_RESTRICT qs = x[i].scales;
8635
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7265
8636
 
7266
8637
  for (int j = 0; j < QK_K/128; ++j) {
7267
8638
  __builtin_prefetch(q6, 0, 0);
@@ -7369,8 +8740,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7369
8740
 
7370
8741
  #elif defined __loongarch_asx
7371
8742
 
7372
- const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
7373
- const __m256i m2 = __lasx_xvreplgr2vr_b(3);
7374
8743
  const __m256i m32s = __lasx_xvreplgr2vr_b(32);
7375
8744
 
7376
8745
  __m256 acc = (__m256)__lasx_xvldi(0);
@@ -7379,62 +8748,46 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7379
8748
 
7380
8749
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7381
8750
 
7382
- const uint8_t * restrict q4 = x[i].ql;
7383
- const uint8_t * restrict qh = x[i].qh;
7384
- const int8_t * restrict q8 = y[i].qs;
8751
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8752
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8753
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7385
8754
 
7386
- const __m128i scales = __lsx_vld((const __m128i*)x[i].scales, 0);
8755
+ const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
8756
+ const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
8757
+ const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask));
7387
8758
 
7388
8759
  __m256i sumi = __lasx_xvldi(0);
7389
8760
 
7390
- int is = 0;
7391
-
7392
8761
  for (int j = 0; j < QK_K/128; ++j) {
7393
8762
 
7394
- const __m128i scale_0 = lsx_shuffle_b(scales, get_scale_shuffle(is + 0));
7395
- const __m128i scale_1 = lsx_shuffle_b(scales, get_scale_shuffle(is + 1));
7396
- const __m128i scale_2 = lsx_shuffle_b(scales, get_scale_shuffle(is + 2));
7397
- const __m128i scale_3 = lsx_shuffle_b(scales, get_scale_shuffle(is + 3));
7398
- is += 4;
7399
-
7400
8763
  const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
7401
8764
  const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
7402
8765
  const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32;
7403
8766
 
7404
- const __m256i q4h_0 = __lasx_xvslli_h(__lasx_xvand_v(q4bitsH, m2), 4);
7405
- const __m256i q4h_1 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 2), m2), 4);
7406
- const __m256i q4h_2 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 4), m2), 4);
7407
- const __m256i q4h_3 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 6), m2), 4);
8767
+ const __m256i q4h_0 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3), 4);
8768
+ const __m256i q4h_1 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3 << 2), 2);
8769
+ const __m256i q4h_2 = __lasx_xvandi_b(q4bitsH, 3 << 4);
8770
+ const __m256i q4h_3 = __lasx_xvsrli_b(__lasx_xvandi_b(q4bitsH, 3 << 6), 2);
7408
8771
 
7409
- const __m256i q4_0 = __lasx_xvor_v(__lasx_xvand_v(q4bits1, m4), q4h_0);
7410
- const __m256i q4_1 = __lasx_xvor_v(__lasx_xvand_v(q4bits2, m4), q4h_1);
7411
- const __m256i q4_2 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits1, 4), m4), q4h_2);
7412
- const __m256i q4_3 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits2, 4), m4), q4h_3);
8772
+ const __m256i q4_0 = __lasx_xvor_v(__lasx_xvandi_b(q4bits1, 0xf), q4h_0);
8773
+ const __m256i q4_1 = __lasx_xvor_v(__lasx_xvandi_b(q4bits2, 0xf), q4h_1);
8774
+ const __m256i q4_2 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits1, 4), q4h_2);
8775
+ const __m256i q4_3 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits2, 4), q4h_3);
7413
8776
 
7414
8777
  const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
7415
8778
  const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
7416
8779
  const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
7417
8780
  const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
7418
8781
 
7419
- __m256i q8s_0 = lasx_maddubs_h(m32s, q8_0);
7420
- __m256i q8s_1 = lasx_maddubs_h(m32s, q8_1);
7421
- __m256i q8s_2 = lasx_maddubs_h(m32s, q8_2);
7422
- __m256i q8s_3 = lasx_maddubs_h(m32s, q8_3);
7423
-
7424
- __m256i p16_0 = lasx_maddubs_h(q4_0, q8_0);
7425
- __m256i p16_1 = lasx_maddubs_h(q4_1, q8_1);
7426
- __m256i p16_2 = lasx_maddubs_h(q4_2, q8_2);
7427
- __m256i p16_3 = lasx_maddubs_h(q4_3, q8_3);
8782
+ __m256i p16_0 = lasx_madd_h_b(__lasx_xvsub_b(q4_0, m32s), q8_0);
8783
+ __m256i p16_1 = lasx_madd_h_b(__lasx_xvsub_b(q4_1, m32s), q8_1);
8784
+ __m256i p16_2 = lasx_madd_h_b(__lasx_xvsub_b(q4_2, m32s), q8_2);
8785
+ __m256i p16_3 = lasx_madd_h_b(__lasx_xvsub_b(q4_3, m32s), q8_3);
7428
8786
 
7429
- p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
7430
- p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
7431
- p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
7432
- p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
7433
-
7434
- p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0);
7435
- p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1);
7436
- p16_2 = lasx_madd_h(lasx_ext8_16(scale_2), p16_2);
7437
- p16_3 = lasx_madd_h(lasx_ext8_16(scale_3), p16_3);
8787
+ p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0);
8788
+ p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1);
8789
+ p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2);
8790
+ p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3);
7438
8791
 
7439
8792
  sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
7440
8793
  sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3));
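The rewritten qh handling in the Q6_K LASX hunk above isolates each 2-bit field with xvandi_b and moves it straight to bit positions 4..5, instead of masking after 16-bit shifts. A scalar sketch of where each field lands; q6_high_fields is a hypothetical helper:

    // Hypothetical scalar form of the q4h_0..q4h_3 extraction: all four 2-bit
    // fields of a qh byte end up in bits 4..5, ready to OR with a low nibble.
    static void q6_high_fields(uint8_t qh, uint8_t out[4]) {
        out[0] = (uint8_t)((qh & 0x03) << 4);
        out[1] = (uint8_t)((qh & 0x0C) << 2);
        out[2] = (uint8_t) (qh & 0x30);
        out[3] = (uint8_t)((qh & 0xC0) >> 2);
    }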
@@ -7444,7 +8797,130 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7444
8797
  }
7445
8798
 
7446
8799
  *s = hsum_float_8(acc);
8800
+ #elif defined(__VXE__) || defined(__VXE2__)
8801
+ float sum = 0;
8802
+
8803
+ // Lower 4-bit and upper 2-bit masks
8804
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
8805
+ const uint8x16_t v_um = vec_splat_u8(0x03);
8806
+
8807
+ const int32x4_t v_z = vec_splat_s32(0);
8808
+
8809
+ int8x16_t q6b[4];
8810
+ uint8x16_t q6h[4];
8811
+
8812
+ uint8x16_t v_xl[4];
8813
+ uint8x16_t v_xh[2];
8814
+ int8x16_t v_y[4];
8815
+
8816
+ for (int i = 0; i < nb; ++i) {
8817
+ const float d_all = GGML_FP16_TO_FP32(x[i].d);
8818
+
8819
+ const uint8_t * GGML_RESTRICT x0l = x[i].ql;
8820
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
8821
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
8822
+
8823
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
8824
+
8825
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
8826
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
8827
+
8828
+ const int8x16_t v_scale = vec_xl(0, scale);
8829
+ const int16x8_t v_scalel = vec_unpackh(v_scale);
8830
+ const int16x8_t v_scaleh = vec_unpackl(v_scale);
8831
+
8832
+ const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
8833
+ const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
8834
+ const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
8835
+ const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
8836
+ const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
7447
8837
 
8838
+ const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
8839
+
8840
+ int32_t isum = 0;
8841
+ for (int j = 0; j < QK_K/128; ++j) {
8842
+ // Load model upper 2 bits
8843
+ v_xh[0] = vec_xl(0 , x0h);
8844
+ v_xh[1] = vec_xl(16, x0h);
8845
+ x0h += 32;
8846
+
8847
+ // Load model lower 4 bits
8848
+ v_xl[0] = vec_xl(0 , x0l);
8849
+ v_xl[1] = vec_xl(16, x0l);
8850
+ v_xl[2] = vec_xl(32, x0l);
8851
+ v_xl[3] = vec_xl(48, x0l);
8852
+ x0l += 64;
8853
+
8854
+ // Load activation quants
8855
+ v_y[0] = vec_xl(0 , y0);
8856
+ v_y[1] = vec_xl(16, y0);
8857
+ v_y[2] = vec_xl(32, y0);
8858
+ v_y[3] = vec_xl(48, y0);
8859
+ y0 += 64;
8860
+
8861
+ q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4);
8862
+ q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4);
8863
+ uint8x16_t shifted = vec_sr(v_xh[0], 2);
8864
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
8865
+ shifted = vec_sr(v_xh[1], 2);
8866
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
8867
+
8868
+ q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0]));
8869
+ q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1]));
8870
+ q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2]));
8871
+ q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3]));
8872
+
8873
+ int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
8874
+ int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
8875
+ int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
8876
+ int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
8877
+
8878
+ isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
8879
+ (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
8880
+ (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
8881
+ (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
8882
+
8883
+ scale += 4;
8884
+
8885
+
8886
+ // Load activation quants
8887
+ v_y[0] = vec_xl(0 , y0);
8888
+ v_y[1] = vec_xl(16, y0);
8889
+ v_y[2] = vec_xl(32, y0);
8890
+ v_y[3] = vec_xl(48, y0);
8891
+ y0 += 64;
8892
+
8893
+ shifted = vec_sr(v_xh[0], 4);
8894
+ q6h[0] = vec_sl(vec_and(v_um, shifted), 4);
8895
+ shifted = vec_sr(v_xh[1], 4);
8896
+ q6h[1] = vec_sl(vec_and(v_um, shifted), 4);
8897
+ shifted = vec_sr(v_xh[0], 6);
8898
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
8899
+ shifted = vec_sr(v_xh[1], 6);
8900
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
8901
+
8902
+ q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0]));
8903
+ q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1]));
8904
+ q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2]));
8905
+ q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3]));
8906
+
8907
+ summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
8908
+ summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
8909
+ summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
8910
+ summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
8911
+
8912
+ isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
8913
+ (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
8914
+ (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
8915
+ (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
8916
+
8917
+ scale += 4;
8918
+ }
8919
+
8920
+ sum += d_all * y[i].d * (isum - 32 * mins);
8921
+ }
8922
+
8923
+ *s = sum;
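Like the x86 paths, the VXE branch above handles the constant -32 offset of Q6_K once per superblock: the scales-times-bsums product computed before the inner loop is subtracted at the end as isum - 32 * mins. A minimal scalar sketch of that identity, assuming 16 scales and 16 bsums entries per 256-value superblock as in block_q6_K and block_q8_K; q6k_apply_offset is a hypothetical helper:

    // Hypothetical scalar form of the offset folding: subtracting 32 from every
    // quant is equivalent to subtracting 32 * sum(scale[b] * bsums[b]) once.
    static int32_t q6k_apply_offset(int32_t isum_raw, const int8_t scales[16], const int16_t bsums[16]) {
        int32_t mins = 0;
        for (int b = 0; b < 16; ++b) {
            mins += (int32_t)scales[b] * bsums[b];
        }
        return isum_raw - 32 * mins;
    }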
7448
8924
  #else
7449
8925
 
7450
8926
  int8_t aux8[QK_K];
@@ -7455,11 +8931,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7455
8931
 
7456
8932
  float sumf = 0;
7457
8933
  for (int i = 0; i < nb; ++i) {
7458
- const uint8_t * restrict q4 = x[i].ql;
7459
- const uint8_t * restrict qh = x[i].qh;
7460
- const int8_t * restrict q8 = y[i].qs;
8934
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8935
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8936
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7461
8937
  memset(aux32, 0, 8*sizeof(int32_t));
7462
- int8_t * restrict a = aux8;
8938
+ int8_t * GGML_RESTRICT a = aux8;
7463
8939
  for (int j = 0; j < QK_K; j += 128) {
7464
8940
  for (int l = 0; l < 32; ++l) {
7465
8941
  a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
@@ -7527,7 +9003,7 @@ static const int8_t keven_signs_q2xs[1024] = {
7527
9003
  };
7528
9004
  #endif
7529
9005
 
7530
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9006
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
7531
9007
  assert(n % QK_K == 0);
7532
9008
  assert(nrc == 1);
7533
9009
  UNUSED(nrc);
@@ -7535,8 +9011,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
7535
9011
  UNUSED(by);
7536
9012
  UNUSED(bs);
7537
9013
 
7538
- const block_iq2_xxs * restrict x = vx;
7539
- const block_q8_K * restrict y = vy;
9014
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
9015
+ const block_q8_K * GGML_RESTRICT y = vy;
7540
9016
 
7541
9017
  const int nb = n / QK_K;
7542
9018
 
@@ -7554,8 +9030,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
7554
9030
  float sumf = 0;
7555
9031
  for (int i = 0; i < nb; ++i) {
7556
9032
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7557
- const uint16_t * restrict q2 = x[i].qs;
7558
- const int8_t * restrict q8 = y[i].qs;
9033
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9034
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7559
9035
  float sumf1 = 0, sumf2 = 0;
7560
9036
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
7561
9037
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -7591,8 +9067,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
7591
9067
  __m256 accumf = _mm256_setzero_ps();
7592
9068
  for (int i = 0; i < nb; ++i) {
7593
9069
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7594
- const uint16_t * restrict q2 = x[i].qs;
7595
- const int8_t * restrict q8 = y[i].qs;
9070
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9071
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7596
9072
  __m256i sumi1 = _mm256_setzero_si256();
7597
9073
  __m256i sumi2 = _mm256_setzero_si256();
7598
9074
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -7632,8 +9108,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
7632
9108
  __m256 accumf = _mm256_setzero_ps();
7633
9109
  for (int i = 0; i < nb; ++i) {
7634
9110
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7635
- const uint16_t * restrict q2 = x[i].qs;
7636
- const int8_t * restrict q8 = y[i].qs;
9111
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9112
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7637
9113
  __m128i sumi1_0 = _mm_setzero_si128();
7638
9114
  __m128i sumi1_1 = _mm_setzero_si128();
7639
9115
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -7697,8 +9173,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
7697
9173
  vector signed int vsumi2 = v0;
7698
9174
  vector signed int vsumi3 = v0;
7699
9175
 
7700
- const uint16_t * restrict q2 = x[i].qs;
7701
- const int8_t * restrict q8 = y[i].qs;
9176
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9177
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7702
9178
 
7703
9179
  for (int j = 0; j < QK_K/32; j += 2) {
7704
9180
  __builtin_prefetch(q2, 0, 1);
@@ -7774,8 +9250,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
7774
9250
  __m256 accumf = (__m256)__lasx_xvldi(0);
7775
9251
  for (int i = 0; i < nb; ++i) {
7776
9252
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7777
- const uint16_t * restrict q2 = x[i].qs;
7778
- const int8_t * restrict q8 = y[i].qs;
9253
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9254
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7779
9255
  __m256i sumi1 = __lasx_xvldi(0);
7780
9256
  __m256i sumi2 = __lasx_xvldi(0);
7781
9257
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -7805,7 +9281,57 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
7805
9281
  }
7806
9282
 
7807
9283
  *s = 0.125f * hsum_float_8(accumf);
7808
-
9284
+ //#elif defined(__VXE__) || defined(__VXE2__)
9285
+ // const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9286
+ //
9287
+ // uint32_t aux32[4];
9288
+ // const uint8_t * aux8 = (const uint8_t *)aux32;
9289
+ //
9290
+ // float sumf = 0;
9291
+ //
9292
+ // for (int i = 0; i < nb; ++i) {
9293
+ // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9294
+ // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9295
+ // const int8_t * GGML_RESTRICT q8 = y[i].qs;
9296
+ //
9297
+ // float sumf1 = 0, sumf2 = 0;
9298
+ //
9299
+ // for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9300
+ // int8x16_t q8b0 = vec_xl( 0, q8);
9301
+ // int8x16_t q8b1 = vec_xl(16, q8);
9302
+ // int8x16_t q8b2 = vec_xl(32, q8);
9303
+ // int8x16_t q8b3 = vec_xl(48, q8);
9304
+ // q8 += 64;
9305
+ //
9306
+ // memcpy(aux32, q2, 4 * sizeof(uint32_t));
9307
+ // q2 += 8;
9308
+ //
9309
+ // int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) };
9310
+ // int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) };
9311
+ // int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) };
9312
+ // int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) };
9313
+ //
9314
+ // int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) };
9315
+ // int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) };
9316
+ // int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) };
9317
+ // int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) };
9318
+ //
9319
+ // q2u0 = vec_mul(q2u0, q2s0);
9320
+ // q2u1 = vec_mul(q2u1, q2s1);
9321
+ // q2u2 = vec_mul(q2u2, q2s2);
9322
+ // q2u3 = vec_mul(q2u3, q2s3);
9323
+ //
9324
+ // const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1);
9325
+ // const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3);
9326
+ //
9327
+ // sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28));
9328
+ // sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28));
9329
+ // }
9330
+ //
9331
+ // sumf += d * (sumf1 + sumf2);
9332
+ // }
9333
+ //
9334
+ // *s = 0.25f * sumf;
7809
9335
  #else
7810
9336
 
7811
9337
  uint32_t aux32[2];
@@ -7814,8 +9340,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
7814
9340
  float sumf = 0.f;
7815
9341
  for (int i = 0; i < nb; ++i) {
7816
9342
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7817
- const uint16_t * restrict q2 = x[i].qs;
7818
- const int8_t * restrict q8 = y[i].qs;
9343
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9344
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7819
9345
  int32_t bsum = 0;
7820
9346
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
7821
9347
  memcpy(aux32, q2, 2*sizeof(uint32_t));
@@ -7838,7 +9364,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
7838
9364
  #endif
7839
9365
  }
7840
9366
 
7841
- void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9367
+ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
7842
9368
  assert(n % QK_K == 0);
7843
9369
  assert(nrc == 1);
7844
9370
  UNUSED(nrc);
@@ -7846,8 +9372,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
7846
9372
  UNUSED(by);
7847
9373
  UNUSED(bs);
7848
9374
 
7849
- const block_iq2_xs * restrict x = vx;
7850
- const block_q8_K * restrict y = vy;
9375
+ const block_iq2_xs * GGML_RESTRICT x = vx;
9376
+ const block_q8_K * GGML_RESTRICT y = vy;
7851
9377
 
7852
9378
  const int nb = n / QK_K;
7853
9379
 
@@ -7864,8 +9390,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
7864
9390
  float sumf = 0;
7865
9391
  for (int i = 0; i < nb; ++i) {
7866
9392
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7867
- const uint16_t * restrict q2 = x[i].qs;
7868
- const int8_t * restrict q8 = y[i].qs;
9393
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9394
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7869
9395
  const uint8x8_t scales8 = vld1_u8(x[i].scales);
7870
9396
  const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
7871
9397
  const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
@@ -7942,8 +9468,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
7942
9468
  __m256 accumf = _mm256_setzero_ps();
7943
9469
  for (int i = 0; i < nb; ++i) {
7944
9470
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7945
- const uint16_t * restrict q2 = x[i].qs;
7946
- const int8_t * restrict q8 = y[i].qs;
9471
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9472
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7947
9473
 
7948
9474
  memcpy(&aux64, x[i].scales, 8);
7949
9475
  __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8063,8 +9589,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8063
9589
  __m256 accumf = _mm256_setzero_ps();
8064
9590
  for (int i = 0; i < nb; ++i) {
8065
9591
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8066
- const uint16_t * restrict q2 = x[i].qs;
8067
- const int8_t * restrict q8 = y[i].qs;
9592
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9593
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8068
9594
 
8069
9595
  memcpy(&aux64, x[i].scales, 8);
8070
9596
  __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8218,8 +9744,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8218
9744
  __m256 accumf = (__m256)__lasx_xvldi(0);
8219
9745
  for (int i = 0; i < nb; ++i) {
8220
9746
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8221
- const uint16_t * restrict q2 = x[i].qs;
8222
- const int8_t * restrict q8 = y[i].qs;
9747
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9748
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8223
9749
 
8224
9750
  memcpy(&aux64, x[i].scales, 8);
8225
9751
  __m128i stmp = __lsx_vreplgr2vr_d(aux64);
@@ -8316,9 +9842,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8316
9842
  vector signed int vsumi2 = v0;
8317
9843
  vector signed int vsumi3 = v0;
8318
9844
 
8319
- const uint16_t * restrict q2 = x[i].qs;
8320
- const uint8_t * restrict sc = x[i].scales;
8321
- const int8_t * restrict q8 = y[i].qs;
9845
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9846
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
9847
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8322
9848
 
8323
9849
  for (int j = 0; j < QK_K/64; ++j) {
8324
9850
  __builtin_prefetch(q2, 0, 1);
@@ -8388,9 +9914,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8388
9914
  float sumf = 0.f;
8389
9915
  for (int i = 0; i < nb; ++i) {
8390
9916
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8391
- const uint16_t * restrict q2 = x[i].qs;
8392
- const uint8_t * restrict sc = x[i].scales;
8393
- const int8_t * restrict q8 = y[i].qs;
9917
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9918
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
9919
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8394
9920
  int32_t bsum = 0;
8395
9921
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
8396
9922
  const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
@@ -8423,7 +9949,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8423
9949
  #endif
8424
9950
  }
8425
9951
 
8426
- void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9952
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
8427
9953
  assert(n % QK_K == 0);
8428
9954
  assert(nrc == 1);
8429
9955
  UNUSED(nrc);
@@ -8431,8 +9957,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
8431
9957
  UNUSED(by);
8432
9958
  UNUSED(bs);
8433
9959
 
8434
- const block_iq2_s * restrict x = vx;
8435
- const block_q8_K * restrict y = vy;
9960
+ const block_iq2_s * GGML_RESTRICT x = vx;
9961
+ const block_q8_K * GGML_RESTRICT y = vy;
8436
9962
 
8437
9963
  const int nb = n / QK_K;
8438
9964
 
@@ -8458,10 +9984,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
8458
9984
 
8459
9985
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8460
9986
 
8461
- const uint8_t * restrict qs = x[i].qs;
8462
- const uint8_t * restrict qh = x[i].qh;
8463
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
8464
- const int8_t * restrict q8 = y[i].qs;
9987
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
9988
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
9989
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
9990
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8465
9991
 
8466
9992
  int sumi1 = 0, sumi2 = 0;
8467
9993
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -8532,10 +10058,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
8532
10058
  __m256 accumf = _mm256_setzero_ps();
8533
10059
  for (int i = 0; i < nb; ++i) {
8534
10060
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8535
- const uint8_t * restrict qs = x[i].qs;
8536
- const uint8_t * restrict qh = x[i].qh;
8537
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
8538
- const int8_t * restrict q8 = y[i].qs;
10061
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10062
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10063
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10064
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8539
10065
 
8540
10066
  memcpy(&aux64, x[i].scales, 8);
8541
10067
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -8605,10 +10131,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
8605
10131
  __m256 accumf = _mm256_setzero_ps();
8606
10132
  for (int i = 0; i < nb; ++i) {
8607
10133
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8608
- const uint8_t * restrict qs = x[i].qs;
8609
- const uint8_t * restrict qh = x[i].qh;
8610
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
8611
- const int8_t * restrict q8 = y[i].qs;
10134
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10135
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10136
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10137
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8612
10138
 
8613
10139
  memcpy(&aux64, x[i].scales, 8);
8614
10140
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -8703,11 +10229,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
8703
10229
  vector signed int vsumi2 = v0;
8704
10230
  vector signed int vsumi3 = v0;
8705
10231
 
8706
- const uint8_t * restrict q2 = x[i].qs;
8707
- const uint8_t * restrict qh = x[i].qh;
8708
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
8709
- const uint8_t * restrict sc = x[i].scales;
8710
- const int8_t * restrict q8 = y[i].qs;
10232
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
10233
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10234
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10235
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
10236
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8711
10237
 
8712
10238
  for (int j = 0; j < QK_K/32; j += 2) {
8713
10239
  __builtin_prefetch(q2, 0, 1);
@@ -8804,10 +10330,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
8804
10330
  __m256 accumf = (__m256)__lasx_xvldi(0);
8805
10331
  for (int i = 0; i < nb; ++i) {
8806
10332
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8807
- const uint8_t * restrict qs = x[i].qs;
8808
- const uint8_t * restrict qh = x[i].qh;
8809
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
8810
- const int8_t * restrict q8 = y[i].qs;
10333
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10334
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10335
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10336
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8811
10337
 
8812
10338
  __m128i tmp1;
8813
10339
  memcpy(&aux64, x[i].scales, 8);
@@ -8901,7 +10427,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
8901
10427
 
8902
10428
  }
8903
10429
 
8904
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10430
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
8905
10431
  assert(n % QK_K == 0);
8906
10432
  assert(nrc == 1);
8907
10433
  UNUSED(nrc);
@@ -8909,8 +10435,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8909
10435
  UNUSED(by);
8910
10436
  UNUSED(bs);
8911
10437
 
8912
- const block_iq3_xxs * restrict x = vx;
8913
- const block_q8_K * restrict y = vy;
10438
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
10439
+ const block_q8_K * GGML_RESTRICT y = vy;
8914
10440
 
8915
10441
  const int nb = n / QK_K;
8916
10442
 
@@ -8926,9 +10452,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8926
10452
  float sumf = 0;
8927
10453
  for (int i = 0; i < nb; ++i) {
8928
10454
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8929
- const uint8_t * restrict q3 = x[i].qs;
8930
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
8931
- const int8_t * restrict q8 = y[i].qs;
10455
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10456
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10457
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8932
10458
  float sumf1 = 0, sumf2 = 0;
8933
10459
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
8934
10460
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -8964,9 +10490,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8964
10490
  __m256 accumf = _mm256_setzero_ps();
8965
10491
  for (int i = 0; i < nb; ++i) {
8966
10492
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8967
- const uint8_t * restrict q3 = x[i].qs;
8968
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
8969
- const int8_t * restrict q8 = y[i].qs;
10493
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10494
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10495
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8970
10496
  __m256i sumi1 = _mm256_setzero_si256();
8971
10497
  __m256i sumi2 = _mm256_setzero_si256();
8972
10498
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9009,9 +10535,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9009
10535
  __m256 accumf = _mm256_setzero_ps();
9010
10536
  for (int i = 0; i < nb; ++i) {
9011
10537
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9012
- const uint8_t * restrict q3 = x[i].qs;
9013
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9014
- const int8_t * restrict q8 = y[i].qs;
10538
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10539
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10540
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9015
10541
  __m128i sumi1_0 = _mm_setzero_si128();
9016
10542
  __m128i sumi1_1 = _mm_setzero_si128();
9017
10543
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -9078,9 +10604,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9078
10604
  vector signed int vsumi2 = v0;
9079
10605
  vector signed int vsumi3 = v0;
9080
10606
 
9081
- const uint8_t * restrict q3 = x[i].qs;
9082
- const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
9083
- const int8_t * restrict q8 = y[i].qs;
10607
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10608
+ const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4);
10609
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9084
10610
 
9085
10611
  #pragma GCC unroll 1
9086
10612
  for (int j = 0; j < QK_K/32; j += 2) {
@@ -9152,9 +10678,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9152
10678
  __m256 accumf = (__m256)__lasx_xvldi(0);
9153
10679
  for (int i = 0; i < nb; ++i) {
9154
10680
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9155
- const uint8_t * restrict q3 = x[i].qs;
9156
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9157
- const int8_t * restrict q8 = y[i].qs;
10681
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10682
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10683
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9158
10684
  __m256i sumi1 = __lasx_xvldi(0);
9159
10685
  __m256i sumi2 = __lasx_xvldi(0);
9160
10686
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9197,9 +10723,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9197
10723
  float sumf = 0.f;
9198
10724
  for (int i = 0; i < nb; ++i) {
9199
10725
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9200
- const uint8_t * restrict q3 = x[i].qs;
9201
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9202
- const int8_t * restrict q8 = y[i].qs;
10726
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10727
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10728
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9203
10729
  int32_t bsum = 0;
9204
10730
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9205
10731
  memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
@@ -9224,7 +10750,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9224
10750
  #endif
9225
10751
  }
9226
10752
 
9227
- void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10753
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9228
10754
  assert(n % QK_K == 0);
9229
10755
  assert(nrc == 1);
9230
10756
  UNUSED(nrc);
@@ -9232,8 +10758,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9232
10758
  UNUSED(by);
9233
10759
  UNUSED(bs);
9234
10760
 
9235
- const block_iq3_s * restrict x = vx;
9236
- const block_q8_K * restrict y = vy;
10761
+ const block_iq3_s * GGML_RESTRICT x = vx;
10762
+ const block_q8_K * GGML_RESTRICT y = vy;
9237
10763
 
9238
10764
  const int nb = n / QK_K;
9239
10765
 
@@ -9270,10 +10796,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9270
10796
  float sumf = 0;
9271
10797
  for (int i = 0; i < nb; ++i) {
9272
10798
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9273
- const uint8_t * restrict qs = x[i].qs;
9274
- const uint8_t * restrict qh = x[i].qh;
9275
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
9276
- const int8_t * restrict q8 = y[i].qs;
10799
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10800
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10801
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10802
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9277
10803
 
9278
10804
  memcpy(scales32, x[i].scales, 4);
9279
10805
  scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
@@ -9352,10 +10878,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9352
10878
  __m256 accumf = _mm256_setzero_ps();
9353
10879
  for (int i = 0; i < nb; ++i) {
9354
10880
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9355
- const uint8_t * restrict qs = x[i].qs;
9356
- const uint8_t * restrict qh = x[i].qh;
9357
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
9358
- const int8_t * restrict q8 = y[i].qs;
10881
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10882
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10883
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10884
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9359
10885
  __m256i sumi1 = _mm256_setzero_si256();
9360
10886
  __m256i sumi2 = _mm256_setzero_si256();
9361
10887
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9437,10 +10963,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9437
10963
  __m256 accumf = _mm256_setzero_ps();
9438
10964
  for (int i = 0; i < nb; ++i) {
9439
10965
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9440
- const uint8_t * restrict qs = x[i].qs;
9441
- const uint8_t * restrict qh = x[i].qh;
9442
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
9443
- const int8_t * restrict q8 = y[i].qs;
10966
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10967
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10968
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10969
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9444
10970
  __m128i sumi1_0 = _mm_setzero_si128();
9445
10971
  __m128i sumi1_1 = _mm_setzero_si128();
9446
10972
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -9538,11 +11064,11 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9538
11064
  vector float vyd = vec_splats(y[i].d);
9539
11065
  vector float vd = vec_mul(vxd, vyd);
9540
11066
 
9541
- const uint8_t * restrict q3 = x[i].qs;
9542
- const uint8_t * restrict qh = x[i].qh;
9543
- const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
9544
- const uint8_t * restrict sc = x[i].scales;
9545
- const int8_t * restrict q8 = y[i].qs;
11067
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
11068
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11069
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs);
11070
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
11071
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9546
11072
 
9547
11073
  vector signed int vsumi0 = v0;
9548
11074
  vector signed int vsumi1 = v0;
@@ -9649,10 +11175,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9649
11175
  __m256 accumf = (__m256)__lasx_xvldi(0);
9650
11176
  for (int i = 0; i < nb; ++i) {
9651
11177
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9652
- const uint8_t * restrict qs = x[i].qs;
9653
- const uint8_t * restrict qh = x[i].qh;
9654
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
9655
- const int8_t * restrict q8 = y[i].qs;
11178
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11179
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11180
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
11181
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9656
11182
  __m256i sumi1 = __lasx_xvldi(0);
9657
11183
  __m256i sumi2 = __lasx_xvldi(0);
9658
11184
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9710,10 +11236,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9710
11236
  float sumf = 0.f;
9711
11237
  for (int i = 0; i < nb; ++i) {
9712
11238
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9713
- const uint8_t * restrict qs = x[i].qs;
9714
- const uint8_t * restrict qh = x[i].qh;
9715
- const uint8_t * restrict signs = x[i].signs;
9716
- const int8_t * restrict q8 = y[i].qs;
11239
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11240
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11241
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
11242
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9717
11243
  int32_t bsum = 0;
9718
11244
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9719
11245
  const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
@@ -9759,17 +11285,13 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
9759
11285
  }
9760
11286
  #elif defined(__loongarch_asx)
9761
11287
  static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
9762
- const __m256i ax = __lasx_xvsigncov_b(x, x);
9763
- const __m256i sy = __lasx_xvsigncov_b(x, y);
9764
- __m256i tmp1, tmp2, tmp3;
9765
- tmp1 = __lasx_xvmulwev_h_bu_b(ax, sy);
9766
- tmp2 = __lasx_xvmulwod_h_bu_b(ax, sy);
9767
- tmp3 = __lasx_xvadd_h(tmp1, tmp2);
9768
- return __lasx_xvsat_h(tmp3, 15);
11288
+ const __m256i a = __lasx_xvmulwev_h_b(x, y);
11289
+ const __m256i b = __lasx_xvmulwod_h_b(x, y);
11290
+ return __lasx_xvadd_h(a, b);
9769
11291
  }
9770
11292
  #endif
9771
11293
 
9772
- void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11294
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9773
11295
  assert(n % QK_K == 0);
9774
11296
  assert(nrc == 1);
9775
11297
  UNUSED(nrc);
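The hunk above also rewrites the LoongArch (LASX) variant of the `mul_add_epi8` helper to use signed widening even/odd multiplies instead of the earlier sign/absolute-value trick. Both variants accumulate adjacent int8 pairs into one int16 lane; here is a hedged scalar sketch of the non-saturating semantics the new version computes (illustration only, the names below are not from ggml):

```c
/* Scalar sketch: each 16-bit output lane is the sum of two adjacent
 * int8 products, matching xvmulwev_h_b + xvmulwod_h_b + xvadd_h. */
#include <stdint.h>
#include <stdio.h>

static void mul_add_epi8_scalar(const int8_t x[32], const int8_t y[32], int16_t out[16]) {
    for (int i = 0; i < 16; ++i) {
        out[i] = (int16_t)(x[2*i] * y[2*i] + x[2*i + 1] * y[2*i + 1]);
    }
}

int main(void) {
    int8_t x[32], y[32];
    int16_t out[16];
    for (int i = 0; i < 32; ++i) { x[i] = (int8_t)(i - 16); y[i] = (int8_t)(3 * i % 17 - 8); }
    mul_add_epi8_scalar(x, y, out);
    printf("out[0] = %d\n", out[0]);   /* x[0]*y[0] + x[1]*y[1] */
    return 0;
}
```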
@@ -9777,8 +11299,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9777
11299
  UNUSED(by);
9778
11300
  UNUSED(bs);
9779
11301
 
9780
- const block_iq1_s * restrict x = vx;
9781
- const block_q8_K * restrict y = vy;
11302
+ const block_iq1_s * GGML_RESTRICT x = vx;
11303
+ const block_q8_K * GGML_RESTRICT y = vy;
9782
11304
 
9783
11305
  const int nb = n / QK_K;
9784
11306
 
@@ -9840,10 +11362,19 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9840
11362
  __m256i sumi = _mm256_setzero_si256();
9841
11363
  int sumi1 = 0;
9842
11364
  for (int ib = 0; ib < QK_K/32; ib += 2) {
11365
+ #ifdef __BMI2__
11366
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
11367
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
11368
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
11369
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
11370
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
11371
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
11372
+ #else
9843
11373
  const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
9844
11374
  iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
9845
11375
  const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
9846
11376
  iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
11377
+ #endif
9847
11378
  qs += 8;
9848
11379
  const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
9849
11380
  const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
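The new `__BMI2__` branch above builds the four iq1_s grid indices per group with `_pdep_u64` instead of four shift/mask expressions. Below is a hedged, self-contained sketch of the bit layout it relies on, checked against the scalar expressions in the `#else` branch; the program and its names are illustrative and not part of the package:

```c
/* Illustrative only: the _pdep_u64 masks scatter the packed low bytes (qs)
 * and 3-bit high parts (qh) into one 11-bit index per 16-bit lane, matching
 * the scalar #else expressions in the diff above. Assumes little-endian. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <immintrin.h>   /* _pdep_u64: needs BMI2, e.g. gcc -mbmi2 */

int main(void) {
    const uint8_t  qs[4] = { 0x12, 0x34, 0x56, 0x78 }; /* low 8 bits of each index   */
    const uint16_t qh    = 0x0ABC;                     /* 4 x 3-bit high parts packed */

    uint32_t qs32;
    memcpy(&qs32, qs, sizeof(qs32));

    /* low byte of each 16-bit lane <- qs[k]; bits 8..10 of each lane <- (qh >> 3k) & 7 */
    const uint64_t packed = _pdep_u64(qs32, 0x00ff00ff00ff00ffULL)
                          | _pdep_u64(qh,   0x0700070007000700ULL);

    uint16_t idx[4];
    memcpy(idx, &packed, sizeof(idx));

    for (int k = 0; k < 4; ++k) {
        const uint16_t ref = (uint16_t)(qs[k] | (((qh >> (3*k)) & 7) << 8)); /* scalar form */
        printf("lane %d: pdep=0x%03x scalar=0x%03x %s\n",
               k, idx[k], ref, idx[k] == ref ? "ok" : "MISMATCH");
    }
    return 0;
}
```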
@@ -9936,10 +11467,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9936
11467
  vector signed int vsumi3 = vec_splats((int32_t)0);
9937
11468
  vector signed int vsumi8 = vec_splats((int32_t)0);
9938
11469
 
9939
- const uint8_t * restrict q1 = x[i].qs;
9940
- const uint16_t * restrict qh = x[i].qh;
9941
- const int8_t * restrict q8 = y[i].qs;
9942
- const int16_t * restrict qs = y[i].bsums;
11470
+ const uint8_t * GGML_RESTRICT q1 = x[i].qs;
11471
+ const uint16_t * GGML_RESTRICT qh = x[i].qh;
11472
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
11473
+ const int16_t * GGML_RESTRICT qs = y[i].bsums;
9943
11474
 
9944
11475
  for (int j = 0; j < QK_K/32; j += 2) {
9945
11476
  __builtin_prefetch(q1, 0, 1);
@@ -10100,7 +11631,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10100
11631
  #endif
10101
11632
  }
10102
11633
 
10103
- void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11634
+ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10104
11635
  assert(n % QK_K == 0);
10105
11636
  assert(nrc == 1);
10106
11637
  UNUSED(nrc);
@@ -10108,8 +11639,8 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10108
11639
  UNUSED(by);
10109
11640
  UNUSED(bs);
10110
11641
 
10111
- const block_iq1_m * restrict x = vx;
10112
- const block_q8_K * restrict y = vy;
11642
+ const block_iq1_m * GGML_RESTRICT x = vx;
11643
+ const block_q8_K * GGML_RESTRICT y = vy;
10113
11644
 
10114
11645
  const int nb = n / QK_K;
10115
11646
 
@@ -10189,6 +11720,10 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10189
11720
 
10190
11721
  const __m256i mask = _mm256_set1_epi16(0x7);
10191
11722
  const __m256i mone = _mm256_set1_epi16(1);
11723
+ const __m256i mone8 = _mm256_set1_epi8(1);
11724
+ const __m256i mtwo8 = _mm256_set1_epi8(2);
11725
+ // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half.
11726
+ const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);
10192
11727
 
10193
11728
  __m256 accum1 = _mm256_setzero_ps();
10194
11729
  __m256 accum2 = _mm256_setzero_ps();
@@ -10200,10 +11735,33 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10200
11735
  const uint16_t * sc = (const uint16_t *)x[i].scales;
10201
11736
 
10202
11737
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
11738
+ // Extract 3-bit scales (16 values)
11739
+ __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc);
11740
+ scales = _mm256_srlv_epi64(scales, scales_shift);
11741
+ scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);
11742
+
11743
+ // Indices to repeat each scale 8 times.
11744
+ __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
11745
+ __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));
10203
11746
 
10204
11747
  __m256i sumi1 = _mm256_setzero_si256();
10205
11748
  __m256i sumi2 = _mm256_setzero_si256();
10206
11749
  for (int ib = 0; ib < QK_K/32; ib += 2) {
11750
+ #ifdef __BMI2__
11751
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
11752
+ | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
11753
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
11754
+ | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
11755
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
11756
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
11757
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
11758
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
11759
+
11760
+ // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
11761
+ const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
11762
+ const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
11763
+ const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
11764
+ #else
10207
11765
  const __m256i q1b_1 = _mm256_set_epi64x(
10208
11766
  iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
10209
11767
  iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
@@ -10212,11 +11770,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10212
11770
  iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
10213
11771
  iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
10214
11772
  );
10215
- const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10216
- const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10217
-
10218
- const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
10219
- const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
10220
11773
 
10221
11774
  const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
10222
11775
  qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
@@ -10226,15 +11779,21 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10226
11779
  qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
10227
11780
  qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
10228
11781
  qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
11782
+ #endif
11783
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
11784
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10229
11785
 
10230
- const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
10231
- const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
11786
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
11787
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
11788
+ const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
11789
+ const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
10232
11790
 
10233
- __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
10234
- __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
11791
+ __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
11792
+ __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);
11793
+
11794
+ scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
11795
+ scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);
10235
11796
 
10236
- scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
10237
- scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
10238
11797
  const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
10239
11798
  const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
10240
11799
  const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
@@ -10390,7 +11949,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10390
11949
  #endif
10391
11950
  }
10392
11951
 
10393
- void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11952
+ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10394
11953
  assert(nrc == 1);
10395
11954
  UNUSED(nrc);
10396
11955
  UNUSED(bx);
@@ -10399,8 +11958,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
10399
11958
  assert(n % QK4_NL == 0);
10400
11959
  static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
10401
11960
 
10402
- const block_iq4_nl * restrict x = vx;
10403
- const block_q8_0 * restrict y = vy;
11961
+ const block_iq4_nl * GGML_RESTRICT x = vx;
11962
+ const block_q8_0 * GGML_RESTRICT y = vy;
10404
11963
 
10405
11964
  const int nb = n / QK4_NL;
10406
11965
 
@@ -10570,6 +12129,27 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
10570
12129
 
10571
12130
  sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
10572
12131
 
12132
+ #elif defined(__VXE__) || defined(__VXE2__)
12133
+ const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
12134
+ const uint8x16_t v_m = vec_splat_u8(0x0F);
12135
+
12136
+ for (; ib < nb; ++ib) {
12137
+ const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
12138
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
12139
+
12140
+ const uint8x16_t v_x = vec_xl(0, x0->qs);
12141
+ int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
12142
+ int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
12143
+
12144
+ v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
12145
+ v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
12146
+
12147
+ const int8x16_t v_yl = vec_xl(0 , y0->qs);
12148
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
12149
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
12150
+
12151
+ sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
12152
+ }
10573
12153
  #endif
10574
12154
  for (; ib < nb; ++ib) {
10575
12155
  const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
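The new VXE/VXE2 (s390x) branch above vectorizes the IQ4_NL dot product by splitting each stored byte into two 4-bit codes and mapping them through the 16-entry `kvalues_iq4nl` table with `vec_perm`. A hedged scalar sketch of the same per-block computation follows; the table literal is reproduced from memory for illustration and may not match the shipped values exactly:

```c
/* Illustration only: scalar equivalent of one IQ4_NL block handled by the
 * VXE branch above. Table values mirror kvalues_iq4nl from memory and are
 * NOT authoritative. */
#include <stdint.h>

static const int8_t kvalues_iq4nl_sketch[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

/* qs: 16 bytes packing 32 4-bit codes; q8: 32 int8 activations.
 * Returns the integer dot product; the caller scales it by
 * GGML_FP16_TO_FP32(x->d) * GGML_FP16_TO_FP32(y->d), as in the diff. */
int32_t iq4nl_block_dot_sketch(const uint8_t qs[16], const int8_t q8[32]) {
    int32_t sum = 0;
    for (int j = 0; j < 16; ++j) {
        sum += kvalues_iq4nl_sketch[qs[j] & 0x0F] * q8[j];       /* low nibbles  -> y[0..15]  */
        sum += kvalues_iq4nl_sketch[qs[j] >>   4] * q8[j + 16];  /* high nibbles -> y[16..31] */
    }
    return sum;
}
```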
@@ -10583,7 +12163,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
10583
12163
  *s = sumf;
10584
12164
  }
10585
12165
 
10586
- void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
12166
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10587
12167
  assert(nrc == 1);
10588
12168
  UNUSED(nrc);
10589
12169
  UNUSED(bx);
@@ -10591,8 +12171,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
10591
12171
  UNUSED(bs);
10592
12172
  assert(n % QK_K == 0);
10593
12173
 
10594
- const block_iq4_xs * restrict x = vx;
10595
- const block_q8_K * restrict y = vy;
12174
+ const block_iq4_xs * GGML_RESTRICT x = vx;
12175
+ const block_q8_K * GGML_RESTRICT y = vy;
10596
12176
 
10597
12177
  const int nb = n / QK_K;
10598
12178
 
@@ -10749,9 +12329,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
10749
12329
 
10750
12330
  uint16_t h = x[ibl].scales_h;
10751
12331
 
10752
- const uint8_t * restrict q4 = x[ibl].qs;
10753
- const uint8_t * restrict sc = x[ibl].scales_l;
10754
- const int8_t * restrict q8 = y[ibl].qs;
12332
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
12333
+ const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l;
12334
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
10755
12335
 
10756
12336
  for (int ib = 0; ib < QK_K/64; ib ++ ) {
10757
12337
  __builtin_prefetch(q4, 0, 1);
@@ -10815,67 +12395,31 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
10815
12395
  #elif defined(__loongarch_asx)
10816
12396
 
10817
12397
  const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
10818
- const __m128i m4b = __lsx_vreplgr2vr_b(0x0f);
10819
12398
 
10820
12399
  __m256 accum = (__m256)__lasx_xvldi(0);
10821
- __m256i tmp1;
10822
- __m128i tmp0, tmp2, tmp3, tmp4, mask_8f, mask;
10823
12400
 
10824
- mask_8f = __lsx_vreplgr2vr_b(0x8f);
10825
12401
  for (int ibl = 0; ibl < nb; ++ibl) {
10826
12402
  const uint8_t * qs = x[ibl].qs;
10827
12403
  const int8_t * q8 = y[ibl].qs;
10828
12404
  uint16_t sh = x[ibl].scales_h;
10829
12405
  __m256i sumi1 = __lasx_xvldi(0);
10830
12406
  __m256i sumi2 = __lasx_xvldi(0);
10831
- __m128i zero = __lsx_vldi(0);
10832
12407
  for (int ib = 0; ib < QK_K/32; ib += 2) {
10833
- const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
10834
- const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
12408
+ const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
12409
+ const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
10835
12410
  const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
10836
12411
  const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
10837
- tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b), mask_8f);
10838
- tmp0 = __lsx_vori_b(tmp2, 0x10);
10839
- mask = __lsx_vsle_b(zero, tmp2);
10840
- tmp3 = __lsx_vand_v(tmp0, mask);
10841
- tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
10842
-
10843
- tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_1, m4b), mask_8f);
10844
- tmp0 = __lsx_vori_b(tmp2, 0x10);
10845
- mask = __lsx_vsle_b(zero, tmp2);
10846
- tmp4 = __lsx_vand_v(tmp0, mask);
10847
- tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
10848
-
10849
- const __m256i q4b_1 = lasx_insertf128(tmp3, tmp4);
10850
-
10851
- tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b), mask_8f);
10852
- tmp0 = __lsx_vori_b(tmp2, 0x10);
10853
- mask = __lsx_vsle_b(zero, tmp2);
10854
- tmp3 = __lsx_vand_v(tmp0, mask);
10855
- tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
10856
-
10857
- tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_2, m4b), mask_8f);
10858
- tmp0 = __lsx_vori_b(tmp2, 0x10);
10859
- mask = __lsx_vsle_b(zero, tmp2);
10860
- tmp4 = __lsx_vand_v(tmp0, mask);
10861
- tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
10862
-
10863
- const __m256i q4b_2 = lasx_insertf128(tmp3, tmp4);
10864
-
12412
+ const __m256i q4b_1 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_1, 4)),
12413
+ __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_1, 0xf)));
12414
+ const __m256i q4b_2 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_2, 4)),
12415
+ __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_2, 0xf)));
10865
12416
  const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10866
12417
  const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10867
12418
  const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
10868
12419
  const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
10869
12420
  sh >>= 4;
10870
- __m256i tmp5, tmp6;
10871
- tmp1 = __lasx_xvreplgr2vr_h(ls1);
10872
- tmp5 = __lasx_xvmulwev_w_h(p16_1, tmp1);
10873
- tmp6 = __lasx_xvmulwod_w_h(p16_1, tmp1);
10874
- const __m256i p_1 = __lasx_xvadd_w(tmp5, tmp6);
10875
- tmp1 = __lasx_xvreplgr2vr_h(ls2);
10876
- tmp5 = __lasx_xvmulwev_w_h(p16_2, tmp1);
10877
- tmp6 = __lasx_xvmulwod_w_h(p16_2, tmp1);
10878
- const __m256i p_2 = __lasx_xvadd_w(tmp5, tmp6);
12421
+ const __m256i p_1 = lasx_madd_h(p16_1, __lasx_xvreplgr2vr_h(ls1));
12422
+ const __m256i p_2 = lasx_madd_h(p16_2, __lasx_xvreplgr2vr_h(ls2));
10879
12423
  sumi1 = __lasx_xvadd_w(p_1, sumi1);
10880
12424
  sumi2 = __lasx_xvadd_w(p_2, sumi2);
10881
12425
  }
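Both the simplified LASX loop above and the new VXE block in the next hunk rebuild the per-32-element IQ4_XS scales from the packed `scales_l` / `scales_h` fields. A hedged scalar sketch of that 6-bit scale reconstruction, following the `ls1` / `ls2` expressions visible in this diff (the helper name and the `QK_K_SKETCH` constant are illustrative assumptions):

```c
/* Illustration only: unpacks the eight 6-bit IQ4_XS block scales (bias 32)
 * from 4 bytes of scales_l (low 4 bits each) and 16 bits of scales_h
 * (high 2 bits each), mirroring the ls1/ls2 expressions in the diff. */
#include <stdint.h>

#define QK_K_SKETCH 256   /* assumed K-quant super-block size */

void iq4xs_unpack_scales_sketch(const uint8_t scales_l[QK_K_SKETCH/64],
                                uint16_t scales_h,
                                int16_t out[QK_K_SKETCH/32]) {
    for (int ib = 0; ib < QK_K_SKETCH/32; ib += 2) {
        out[ib + 0] = (int16_t)(((scales_l[ib/2] & 0x0F) | ((scales_h << 4) & 0x30)) - 32);
        out[ib + 1] = (int16_t)(((scales_l[ib/2] >>   4) | ((scales_h << 2) & 0x30)) - 32);
        scales_h >>= 4;   /* advance to the next pair of 2-bit high parts */
    }
}
```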
@@ -10884,6 +12428,56 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
10884
12428
  }
10885
12429
 
10886
12430
  *s = hsum_float_8(accum);
12431
+ #elif defined(__VXE__) || defined(__VXE2__)
12432
+ const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
12433
+ const uint8x16_t v_m = vec_splat_u8(0x0F);
12434
+
12435
+ float sumf = 0;
12436
+
12437
+ for (int ibl = 0; ibl < nb; ++ibl) {
12438
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
12439
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
12440
+
12441
+ uint16_t h = x[ibl].scales_h;
12442
+
12443
+ int sumi1 = 0, sumi2 = 0;
12444
+ for (int ib = 0; ib < QK_K/64; ++ib) {
12445
+ const uint8x16_t v_x0 = vec_xl(0 , q4);
12446
+ const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4);
12447
+ q4 += 32;
12448
+
12449
+ int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
12450
+ int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
12451
+ int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
12452
+ int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
12453
+
12454
+ v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
12455
+ v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
12456
+ v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
12457
+ v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
12458
+
12459
+ const int8x16_t v_y0 = vec_xl( 0, q8);
12460
+ const int8x16_t v_y1 = vec_xl(16, q8);
12461
+ const int8x16_t v_y2 = vec_xl(32, q8);
12462
+ const int8x16_t v_y3 = vec_xl(48, q8);
12463
+ q8 += 64;
12464
+
12465
+ int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1);
12466
+ int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);
12467
+
12468
+ int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
12469
+ int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
12470
+
12471
+ h >>= 4;
12472
+
12473
+ sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
12474
+ sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
12475
+ }
12476
+
12477
+ sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
12478
+ }
12479
+
12480
+ *s = sumf;
10887
12481
 
10888
12482
  #else
10889
12483
  float sumf = 0;
@@ -10922,12 +12516,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
10922
12516
 
10923
12517
  // ============================ 4-bit non-linear quants
10924
12518
 
10925
- void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) {
12519
+ void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
10926
12520
  assert(k % QK4_NL == 0);
10927
12521
  quantize_row_iq4_nl_ref(x, y, k);
10928
12522
  }
10929
12523
 
10930
- void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) {
12524
+ void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
10931
12525
  assert(k % QK_K == 0);
10932
12526
  quantize_iq4_xs(x, y, 1, k, NULL);
10933
12527
  }