@fugood/llama.node 0.3.13 → 0.3.15

This diff shows the contents of publicly available package versions as published to their respective public registries. It is provided for informational purposes only and reflects the changes between the two released versions.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -719,28 +719,28 @@ static inline __m128i packNibbles( __m256i bytes ) {
  }
  #endif //__loongarch_asx

- void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q4_0_ref(x, y, k);
  }

- void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q4_1_ref(x, y, k);
  }

- void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q5_0_ref(x, y, k);
  }

- void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q5_1_ref(x, y, k);
  }

- void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(QK8_0 == 32);
  assert(k % QK8_0 == 0);
  const int nb = k / QK8_0;

- block_q8_0 * restrict y = vy;
+ block_q8_0 * GGML_RESTRICT y = vy;

  #if defined(__ARM_NEON)
  for (int i = 0; i < nb; i++) {
@@ -1011,6 +1011,38 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);

  }
+ #elif defined(__VXE__) || defined(__VXE2__)
+ for (int i = 0; i < nb; i++) {
+ __vector float srcv [8];
+ __vector float asrcv[8];
+ __vector float amaxv[8];
+
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+ vec_extract(amaxv[0], 1)),
+ MAX(vec_extract(amaxv[0], 2),
+ vec_extract(amaxv[0], 3)));
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f / d : 0.0f;
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ for (int j = 0; j < 8; j++) {
+ const __vector float v = vec_mul(srcv[j], vec_splats(id));
+ const __vector int32_t vi = vec_signed(v);
+
+ y[i].qs[4*j + 0] = vec_extract(vi, 0);
+ y[i].qs[4*j + 1] = vec_extract(vi, 1);
+ y[i].qs[4*j + 2] = vec_extract(vi, 2);
+ y[i].qs[4*j + 3] = vec_extract(vi, 3);
+ }
+ }
  #else
  GGML_UNUSED(nb);
  // scalar
@@ -1018,11 +1050,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  #endif
  }

- void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK8_1 == 0);
  const int nb = k / QK8_1;

- block_q8_1 * restrict y = vy;
+ block_q8_1 * GGML_RESTRICT y = vy;

  #if defined(__ARM_NEON)
  for (int i = 0; i < nb; i++) {
@@ -1337,6 +1369,44 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
  __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0);
  __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
  }
+ #elif defined(__VXE__) || defined(__VXE2__)
+ for (int i = 0; i < nb; i++) {
+ __vector float srcv [8];
+ __vector float asrcv[8];
+ __vector float amaxv[8];
+
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+ vec_extract(amaxv[0], 1)),
+ MAX(vec_extract(amaxv[0], 2),
+ vec_extract(amaxv[0], 3)));
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f / d : 0.0f;
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ __vector int32_t acc = vec_splats(0);
+
+ for (int j = 0; j < 8; j++) {
+ const __vector float v = vec_mul(srcv[j], vec_splats(id));
+ const __vector int32_t vi = vec_signed(v);
+
+ y[i].qs[4*j + 0] = vec_extract(vi, 0);
+ y[i].qs[4*j + 1] = vec_extract(vi, 1);
+ y[i].qs[4*j + 2] = vec_extract(vi, 2);
+ y[i].qs[4*j + 3] = vec_extract(vi, 3);
+
+ acc = vec_add(acc, vi);
+ }
+
+ y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
+ }
  #else
  GGML_UNUSED(nb);
  // scalar
@@ -1358,8 +1428,8 @@ static inline int nearest_int(float fval) {
  return (i & 0x007fffff) - 0x00400000;
  }

- static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
- const float * restrict qw) {
+ static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
+ const float * GGML_RESTRICT qw) {
  float max = 0;
  float amax = 0;
  for (int i = 0; i < n; ++i) {
@@ -1427,7 +1497,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
  return scale;
  }

- static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
+ static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
  float max = 0;
  float amax = 0;
  for (int i = 0; i < n; ++i) {
@@ -1486,7 +1556,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
  return 1/iscale;
  }

- static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+ static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
  int ntry, float alpha) {
  float min = x[0];
  float max = x[0];
@@ -1529,8 +1599,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
  return scale;
  }

- static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
  float rmin, float rdelta, int nstep, bool use_mad) {
  float min = x[0];
  float max = x[0];
@@ -1610,7 +1680,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
  return scale;
  }

- static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
+ static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
  if (j < 4) {
  *d = q[j] & 63; *m = q[j + 4] & 63;
  } else {
@@ -1621,51 +1691,51 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *

  //========================- 2-bit (de)-quantization

- void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  quantize_row_q2_K_ref(x, vy, k);
  }

  //========================= 3-bit (de)-quantization

- void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  quantize_row_q3_K_ref(x, vy, k);
  }

  // ====================== 4-bit (de)-quantization

- void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_q4_K * restrict y = vy;
+ block_q4_K * GGML_RESTRICT y = vy;
  quantize_row_q4_K_ref(x, y, k);
  }

  // ====================== 5-bit (de)-quantization

- void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_q5_K * restrict y = vy;
+ block_q5_K * GGML_RESTRICT y = vy;
  quantize_row_q5_K_ref(x, y, k);
  }

  // ====================== 6-bit (de)-quantization

- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_q6_K * restrict y = vy;
+ block_q6_K * GGML_RESTRICT y = vy;
  quantize_row_q6_K_ref(x, y, k);
  }

  // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)

- void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_tq1_0 * restrict y = vy;
+ block_tq1_0 * GGML_RESTRICT y = vy;
  quantize_row_tq1_0_ref(x, y, k);
  }

- void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_tq2_0 * restrict y = vy;
+ block_tq2_0 * GGML_RESTRICT y = vy;
  quantize_row_tq2_0_ref(x, y, k);
  }

@@ -1673,11 +1743,11 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1

  //===================================== Q8_K ==============================================

- void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  #ifdef __wasm_simd128__
  assert(k % QK_K == 0);
  const int64_t nb = k / QK_K;
- block_q8_K * restrict yc = y; // Cast to proper type
+ block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type

  for (int i = 0; i < nb; i++) {
  const float * x_block = x + i * QK_K;
@@ -1839,7 +1909,7 @@ static inline __m128i get_scale_shuffle(int i) {
  }
  #endif

- void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

@@ -1854,23 +1924,23 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);

- const block_q4_0 * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_q4_0 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;

  #if defined(__ARM_FEATURE_MATMUL_INT8)
  if (nrc == 2) {
- const block_q4_0 * restrict vx0 = vx;
- const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
- const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+ const block_q4_0 * GGML_RESTRICT vx0 = vx;
+ const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

  float32x4_t sumv0 = vdupq_n_f32(0.0f);

  for (int i = 0; i < nb; i++) {
- const block_q4_0 * restrict b_x0 = &vx0[i];
- const block_q4_0 * restrict b_x1 = &vx1[i];
- const block_q8_0 * restrict b_y0 = &vy0[i];
- const block_q8_0 * restrict b_y1 = &vy1[i];
+ const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i];
+ const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i];
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];

  const uint8x16_t m4b = vdupq_n_u8(0x0F);
  const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -1947,10 +2017,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t ph4 = svptrue_pat_b32(SV_VL4);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // load x
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -1993,10 +2063,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // load x
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -2034,10 +2104,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t pl16 = svnot_b_z(ph32, ph16);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // load x
  const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
@@ -2074,10 +2144,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  float32x4_t sumv1 = vdupq_n_f32(0.0f);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  const uint8x16_t m4b = vdupq_n_u8(0x0F);
  const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -2119,10 +2189,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const v128_t s8b = wasm_i8x16_splat(0x8);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // Load and process x0
  v128_t v0_0 = wasm_v128_load(x0->qs);
@@ -2488,6 +2558,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  }

  sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+ #elif defined(__VXE__) || defined(__VXE2__)
+ __vector float acc = vec_splats(0.0f);
+
+ const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
+ const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
+
+ for (; ib < nb; ++ib) {
+ const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
+ const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
+ const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+
+ const __vector int8_t v_xls = vec_sub(v_xl, v_s);
+ const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+
+ const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
+ const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+
+ const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
+ const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
+ const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
+ const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+
+ __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+
+ const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
+ const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+ acc = vec_madd(v_xy, v_d, acc);
+ }
+
+ sumf = acc[0] + acc[1] + acc[2] + acc[3];
  #endif
  for (; ib < nb; ++ib) {
  int sumi0 = 0;
@@ -2508,7 +2609,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2508
2609
  *s = sumf;
2509
2610
  }
2510
2611
 
2511
- void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
2612
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2512
2613
  const int qk = QK8_1;
2513
2614
  const int nb = n / qk;
2514
2615
 
@@ -2523,24 +2624,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
2523
2624
  UNUSED(by);
2524
2625
  UNUSED(bs);
2525
2626
 
2526
- const block_q4_1 * restrict x = vx;
2527
- const block_q8_1 * restrict y = vy;
2627
+ const block_q4_1 * GGML_RESTRICT x = vx;
2628
+ const block_q8_1 * GGML_RESTRICT y = vy;
2528
2629
 
2529
2630
  #if defined(__ARM_FEATURE_MATMUL_INT8)
2530
2631
  if (nrc == 2) {
2531
- const block_q4_1 * restrict vx0 = vx;
2532
- const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
2533
- const block_q8_1 * restrict vy0 = vy;
2534
- const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
2632
+ const block_q4_1 * GGML_RESTRICT vx0 = vx;
2633
+ const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
2634
+ const block_q8_1 * GGML_RESTRICT vy0 = vy;
2635
+ const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
2535
2636
 
2536
2637
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
2537
2638
  float32x4_t summs0 = vdupq_n_f32(0.0f);
2538
2639
 
2539
2640
  for (int i = 0; i < nb; i++) {
2540
- const block_q4_1 * restrict b_x0 = &vx0[i];
2541
- const block_q4_1 * restrict b_x1 = &vx1[i];
2542
- const block_q8_1 * restrict b_y0 = &vy0[i];
2543
- const block_q8_1 * restrict b_y1 = &vy1[i];
2641
+ const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i];
2642
+ const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i];
2643
+ const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i];
2644
+ const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];
2544
2645
 
2545
2646
  float32_t summs_t[4] = {
2546
2647
  GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
@@ -2614,10 +2715,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
2614
2715
  float summs = 0;
2615
2716
 
2616
2717
  for (; ib + 1 < nb; ib += 2) {
2617
- const block_q4_1 * restrict x0 = &x[ib + 0];
2618
- const block_q4_1 * restrict x1 = &x[ib + 1];
2619
- const block_q8_1 * restrict y0 = &y[ib + 0];
2620
- const block_q8_1 * restrict y1 = &y[ib + 1];
2718
+ const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0];
2719
+ const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1];
2720
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
2721
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
2621
2722
 
2622
2723
  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
2623
2724
 
@@ -2781,6 +2882,35 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
2781
2882
  }
2782
2883
 
2783
2884
  sumf = hsum_float_8(acc) + summs;
2885
+ #elif defined(__VXE__) || defined(__VXE2__)
2886
+ float summs = 0;
2887
+ float32x4_t acc = vec_splats(0.0f);
2888
+
2889
+ const uint8x16_t v_m = vec_splat_u8(0x0F);
2890
+
2891
+ #pragma GCC unroll 4
2892
+ for (; ib < nb; ++ib) {
2893
+ __builtin_prefetch(x[ib].qs, 0, 1);
2894
+ __builtin_prefetch(y[ib].qs, 0, 1);
2895
+
2896
+ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
2897
+
2898
+ const uint8x16_t v_x = vec_xl(0, x[ib].qs);
2899
+ const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
2900
+ const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
2901
+
2902
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
2903
+ const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs);
2904
+
2905
+ const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
2906
+ const float32x4_t v_xy = vec_float(v_xy_);
2907
+
2908
+ const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
2909
+
2910
+ acc = vec_madd(v_xy, v_d, acc);
2911
+ }
2912
+
2913
+ sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
2784
2914
  #endif
2785
2915
  for (; ib < nb; ++ib) {
2786
2916
  int sumi0 = 0;
@@ -2801,7 +2931,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
2801
2931
  *s = sumf;
2802
2932
  }
2803
2933
 
2804
- void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
2934
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2805
2935
  const int qk = QK8_0;
2806
2936
  const int nb = n / qk;
2807
2937
 
@@ -2816,8 +2946,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2816
2946
  UNUSED(by);
2817
2947
  UNUSED(bs);
2818
2948
 
2819
- const block_q5_0 * restrict x = vx;
2820
- const block_q8_0 * restrict y = vy;
2949
+ const block_q5_0 * GGML_RESTRICT x = vx;
2950
+ const block_q8_0 * GGML_RESTRICT y = vy;
2821
2951
 
2822
2952
  #if defined(__ARM_NEON)
2823
2953
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -2830,10 +2960,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2830
2960
  uint64_t tmp1[4];
2831
2961
 
2832
2962
  for (; ib + 1 < nb; ib += 2) {
2833
- const block_q5_0 * restrict x0 = &x[ib];
2834
- const block_q5_0 * restrict x1 = &x[ib + 1];
2835
- const block_q8_0 * restrict y0 = &y[ib];
2836
- const block_q8_0 * restrict y1 = &y[ib + 1];
2963
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
2964
+ const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
2965
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
2966
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
2837
2967
 
2838
2968
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
2839
2969
 
@@ -2894,8 +3024,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
2894
3024
 
2895
3025
  // TODO: check if unrolling this is better
2896
3026
  for (; ib < nb; ++ib) {
2897
- const block_q5_0 * restrict x0 = &x[ib];
2898
- const block_q8_0 * restrict y0 = &y[ib];
3027
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
3028
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
2899
3029
 
2900
3030
  const v128_t m4b = wasm_i8x16_splat(0x0F);
2901
3031
 
@@ -3156,7 +3286,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3156
3286
  *s = sumf;
3157
3287
  }
3158
3288
 
3159
- void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
3289
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3160
3290
  const int qk = QK8_1;
3161
3291
  const int nb = n / qk;
3162
3292
 
@@ -3171,8 +3301,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3171
3301
  UNUSED(by);
3172
3302
  UNUSED(bs);
3173
3303
 
3174
- const block_q5_1 * restrict x = vx;
3175
- const block_q8_1 * restrict y = vy;
3304
+ const block_q5_1 * GGML_RESTRICT x = vx;
3305
+ const block_q8_1 * GGML_RESTRICT y = vy;
3176
3306
 
3177
3307
  #if defined(__ARM_NEON)
3178
3308
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -3188,10 +3318,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3188
3318
  uint64_t tmp1[4];
3189
3319
 
3190
3320
  for (; ib + 1 < nb; ib += 2) {
3191
- const block_q5_1 * restrict x0 = &x[ib];
3192
- const block_q5_1 * restrict x1 = &x[ib + 1];
3193
- const block_q8_1 * restrict y0 = &y[ib];
3194
- const block_q8_1 * restrict y1 = &y[ib + 1];
3321
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
3322
+ const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
3323
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
3324
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
3195
3325
 
3196
3326
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
3197
3327
 
@@ -3257,8 +3387,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3257
3387
 
3258
3388
  // TODO: check if unrolling this is better
3259
3389
  for (; ib < nb; ++ib) {
3260
- const block_q5_1 * restrict x0 = &x[ib];
3261
- const block_q8_1 * restrict y0 = &y[ib];
3390
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
3391
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
3262
3392
 
3263
3393
  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
3264
3394
 
@@ -3530,7 +3660,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3530
3660
  *s = sumf;
3531
3661
  }
3532
3662
 
3533
- void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
3663
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3534
3664
  const int qk = QK8_0;
3535
3665
  const int nb = n / qk;
3536
3666
 
@@ -3545,24 +3675,24 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3545
3675
  UNUSED(by);
3546
3676
  UNUSED(bs);
3547
3677
 
3548
- const block_q8_0 * restrict x = vx;
3549
- const block_q8_0 * restrict y = vy;
3678
+ const block_q8_0 * GGML_RESTRICT x = vx;
3679
+ const block_q8_0 * GGML_RESTRICT y = vy;
3550
3680
 
3551
3681
  #if defined(__ARM_FEATURE_MATMUL_INT8)
3552
3682
  if (nrc == 2) {
3553
- const block_q8_0 * restrict vx0 = vx;
3554
- const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
3555
- const block_q8_0 * restrict vy0 = vy;
3556
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
3683
+ const block_q8_0 * GGML_RESTRICT vx0 = vx;
3684
+ const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
3685
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
3686
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
3557
3687
 
3558
3688
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3559
3689
 
3560
3690
  for (int i = 0; i < nb; i++) {
3561
- const block_q8_0 * restrict b_x0 = &vx0[i];
3562
- const block_q8_0 * restrict b_y0 = &vy0[i];
3691
+ const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i];
3692
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
3563
3693
 
3564
- const block_q8_0 * restrict b_x1 = &vx1[i];
3565
- const block_q8_0 * restrict b_y1 = &vy1[i];
3694
+ const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i];
3695
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];
3566
3696
 
3567
3697
  const int8x16_t x0_l = vld1q_s8(b_x0->qs);
3568
3698
  const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
@@ -3627,10 +3757,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3627
3757
  const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
3628
3758
 
3629
3759
  for (; ib + 1 < nb; ib += 2) {
3630
- const block_q8_0 * restrict x0 = &x[ib + 0];
3631
- const block_q8_0 * restrict x1 = &x[ib + 1];
3632
- const block_q8_0 * restrict y0 = &y[ib + 0];
3633
- const block_q8_0 * restrict y1 = &y[ib + 1];
3760
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3761
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3762
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3763
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3634
3764
 
3635
3765
  // load x
3636
3766
  const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
@@ -3658,10 +3788,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3658
3788
  {
3659
3789
  //printf("sve256");
3660
3790
  for (; ib + 1 < nb; ib += 2) {
3661
- const block_q8_0 * restrict x0 = &x[ib + 0];
3662
- const block_q8_0 * restrict x1 = &x[ib + 1];
3663
- const block_q8_0 * restrict y0 = &y[ib + 0];
3664
- const block_q8_0 * restrict y1 = &y[ib + 1];
3791
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3792
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3793
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3794
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3665
3795
 
3666
3796
  // load x
3667
3797
  const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
@@ -3694,10 +3824,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3694
3824
  svfloat32_t sumv00 = svdup_n_f32(0.0f);
3695
3825
 
3696
3826
  for (; ib + 1 < nb; ib += 2) {
3697
- const block_q8_0 * restrict x0 = &x[ib + 0];
3698
- const block_q8_0 * restrict x1 = &x[ib + 1];
3699
- const block_q8_0 * restrict y0 = &y[ib + 0];
3700
- const block_q8_0 * restrict y1 = &y[ib + 1];
3827
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3828
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3829
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3830
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3701
3831
 
3702
3832
  //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
3703
3833
  // and add them to make one 64 element vector
@@ -3737,10 +3867,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3737
3867
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
3738
3868
 
3739
3869
  for (; ib + 1 < nb; ib += 2) {
3740
- const block_q8_0 * restrict x0 = &x[ib + 0];
3741
- const block_q8_0 * restrict x1 = &x[ib + 1];
3742
- const block_q8_0 * restrict y0 = &y[ib + 0];
3743
- const block_q8_0 * restrict y1 = &y[ib + 1];
3870
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
3871
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
3872
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
3873
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
3744
3874
 
3745
3875
  const int8x16_t x0_0 = vld1q_s8(x0->qs);
3746
3876
  const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
@@ -3767,8 +3897,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3767
3897
  v128_t sumv = wasm_f32x4_splat(0.0f);
3768
3898
 
3769
3899
  for (; ib < nb; ++ib) {
3770
- const block_q8_0 * restrict x0 = &x[ib];
3771
- const block_q8_0 * restrict y0 = &y[ib];
3900
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib];
3901
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
3772
3902
 
3773
3903
  const v128_t x0_0 = wasm_v128_load(x0->qs);
3774
3904
  const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
@@ -3915,6 +4045,27 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3915
4045
  }
3916
4046
 
3917
4047
  sumf = hsum_float_8(acc);
4048
+ #elif defined(__VXE__) || defined(__VXE2__)
4049
+ __vector float acc = vec_splats(0.0f);
4050
+
4051
+ #pragma GCC unroll 8
4052
+ for (; ib < nb; ++ib) {
4053
+ __builtin_prefetch(x[ib].qs, 0, 1);
4054
+ __builtin_prefetch(y[ib].qs, 0, 1);
4055
+
4056
+ const int8x16_t v_xl = vec_xl(0 , x[ib].qs);
4057
+ const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs);
4058
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
4059
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
4060
+
4061
+ const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
4062
+ const float32x4_t v_xy = vec_float(v_xy_);
4063
+ const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
4064
+
4065
+ acc = vec_madd(v_xy, v_d, acc);
4066
+ }
4067
+
4068
+ sumf = acc[0] + acc[1] + acc[2] + acc[3];
3918
4069
  #endif
3919
4070
  for (; ib < nb; ++ib) {
3920
4071
  int sumi = 0;
@@ -3929,15 +4080,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3929
4080
  *s = sumf;
3930
4081
  }
3931
4082
 
3932
- void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
4083
+ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3933
4084
  assert(nrc == 1);
3934
4085
  UNUSED(nrc);
3935
4086
  UNUSED(bx);
3936
4087
  UNUSED(by);
3937
4088
  UNUSED(bs);
3938
4089
 
3939
- const block_tq1_0 * restrict x = vx;
3940
- const block_q8_K * restrict y = vy;
4090
+ const block_tq1_0 * GGML_RESTRICT x = vx;
4091
+ const block_q8_K * GGML_RESTRICT y = vy;
3941
4092
 
3942
4093
  const int nb = n / QK_K;
3943
4094
 
@@ -4252,15 +4403,15 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
4252
4403
  #endif
4253
4404
  }
4254
4405
 
4255
- void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
4406
+ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4256
4407
  assert(nrc == 1);
4257
4408
  UNUSED(nrc);
4258
4409
  UNUSED(bx);
4259
4410
  UNUSED(by);
4260
4411
  UNUSED(bs);
4261
4412
 
4262
- const block_tq2_0 * restrict x = vx;
4263
- const block_q8_K * restrict y = vy;
4413
+ const block_tq2_0 * GGML_RESTRICT x = vx;
4414
+ const block_q8_K * GGML_RESTRICT y = vy;
4264
4415
 
4265
4416
  const int nb = n / QK_K;
4266
4417
 
@@ -4424,19 +4575,264 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
4424
4575
  #endif
4425
4576
  }
4426
4577
 
4427
- void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
4578
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4428
4579
  assert(nrc == 1);
4429
4580
  UNUSED(nrc);
4430
4581
  UNUSED(bx);
4431
4582
  UNUSED(by);
4432
4583
  UNUSED(bs);
4433
4584
 
4434
- const block_q2_K * restrict x = vx;
4435
- const block_q8_K * restrict y = vy;
4585
+ const block_q2_K * GGML_RESTRICT x = vx;
4586
+ const block_q8_K * GGML_RESTRICT y = vy;
4436
4587
 
4437
4588
  const int nb = n / QK_K;
4438
4589
 
4439
- #ifdef __ARM_NEON
4590
+ #ifdef __ARM_FEATURE_SVE
4591
+ const int vector_length = svcntb()*8;
4592
+ const svuint8_t m3s = svdup_n_u8(0x3);
4593
+ const svuint32_t m4s = svdup_n_u32(0xF);
4594
+ const svint32_t vzero_sv = svdup_n_s32(0);
4595
+ svfloat32_t acc_sum = svdup_n_f32(0);
4596
+ svbool_t pred_s32 = svptrue_pat_b32(SV_VL4);
4597
+
4598
+ switch (vector_length) {
4599
+ case 128:
4600
+ for (int i = 0; i < nb; ++i) {
4601
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4602
+ svfloat32_t d_broad = svdup_n_f32((float32_t)d);
4603
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4604
+ svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
4605
+
4606
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4607
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
4608
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4609
+
4610
+ svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc);
4611
+ const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
4612
+
4613
+ mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+4);
4614
+ const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
4615
+
4616
+ svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums);
4617
+ svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+4);
4618
+
4619
+ const svint32_t s0 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_2, q8sums_sv_2));
4620
+
4621
+ mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+8);
4622
+ const svint32_t mins_sv_3 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
4623
+
4624
+ mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+12);
4625
+ const svint32_t mins_sv_4 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
4626
+
4627
+ q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums+8);
4628
+ q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+12);
4629
+
4630
+ svint32_t s1 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_3, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_4, q8sums_sv_2));
4631
+
4632
+ svfloat32_t temp = svcvt_f32_s32_x(svptrue_b32(), svadd_s32_x(svptrue_b32(), s0, s1));
4633
+
4634
+ acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, temp, dmin_broad);
4635
+
4636
+ svint32_t sumi1 = svdup_n_s32(0);
4637
+
4638
+ {
4639
+ const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2);
4640
+ svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s));
4641
+ svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4642
+ const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc), m4s));
4643
+
4644
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 0));
4645
+
4646
+ const svuint8_t q2bits_3 = svld1_u8(svptrue_b8(), q2+16);
4647
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_3, m3s));
4648
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4649
+
4650
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 1));
4651
+
4652
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 2), m3s));
4653
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4654
+
4655
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 2));
4656
+
4657
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 2), m3s));
4658
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4659
+
4660
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 3));
4661
+
4662
+
4663
+ const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+4), m4s));
4664
+
4665
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 4), m3s));
4666
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4667
+
4668
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 0));
4669
+
4670
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 4), m3s));
4671
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4672
+
4673
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 1));
4674
+
4675
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 6), m3s));
4676
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4677
+
4678
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 2));
4679
+
4680
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 6), m3s));
4681
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4682
+
4683
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 3));
4684
+
4685
+ //-------------------------------
4686
+
4687
+ q2 += 32;
4688
+ const svint32_t scales_sv_2 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+8), m4s));
4689
+ const svuint8_t q2bits_2 = svld1_u8(svptrue_b8(), q2);
4690
+
4691
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_2, m3s));
4692
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4693
+
4694
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 0));
4695
+
4696
+ const svuint8_t q2bits_4 = svld1_u8(svptrue_b8(), q2+16);
4697
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_4, m3s));
4698
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4699
+
4700
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 1));
4701
+
4702
+
4703
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 2), m3s));
4704
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4705
+
4706
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 2));
4707
+
4708
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 2), m3s));
4709
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4710
+
4711
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 3));
4712
+
4713
+
4714
+ const svint32_t scales_sv_3 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+12), m4s));
4715
+
4716
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 4), m3s));
4717
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4718
+
4719
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 0));
4720
+
4721
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 4), m3s));
4722
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4723
+
4724
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 1));
4725
+
4726
+
4727
+
4728
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 6), m3s));
4729
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4730
+
4731
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 2));
4732
+
4733
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 6), m3s));
4734
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
4735
+
4736
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 3));
4737
+ }
4738
+ acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, svcvt_f32_s32_x(svptrue_b32(), sumi1), d_broad);
4739
+ }
4740
+ *s = svaddv_f32(svptrue_b32(), acc_sum);
4741
+ break;
4742
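The 128-bit case above works 16 bytes at a time with full predicates (svptrue_b8), while the 256/512-bit case below pins exactly 32 bytes per step with SV_VL32 patterns. The repeated building block is the same in both: mask out one 2-bit plane, load the matching q8 bytes, and accumulate with SDOT. A minimal sketch of that step for the 32-byte case, assuming an SVE vector length of at least 256 bits (the function name and layout are illustrative, not part of the diff):

#include <arm_sve.h>
#include <stdint.h>

// Dot one 2-bit plane of 32 packed quants against 32 int8 activations.
// Build with SVE enabled, e.g. -march=armv8.2-a+sve.
static int32_t dot_low2bits_x_q8_vl32(const uint8_t *q2, const int8_t *q8) {
    const svbool_t  pg8 = svptrue_pat_b8(SV_VL32);   // exactly 32 byte lanes
    const svuint8_t m3  = svdup_n_u8(0x3);           // keep the low 2 bits

    const svint8_t q2b = svreinterpret_s8_u8(svand_u8_x(pg8, svld1_u8(pg8, q2), m3));
    const svint8_t q8b = svld1_s8(pg8, q8);

    // SDOT sums each group of four adjacent byte products into one s32 lane.
    const svint32_t acc = svdot_s32(svdup_n_s32(0), q2b, q8b);
    return (int32_t)svaddv_s32(svptrue_pat_b32(SV_VL8), acc);
}

The kernel defers this final reduction: it keeps the s32 lanes, multiplies them by the per-sub-block scale with svmla_s32_m, and reduces only the floating-point accumulator at the very end.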
+
4743
+ case 256:
4744
+ case 512:
4745
+ for (int i = 0; i < nb; ++i) {
4746
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4747
+ svfloat32_t d_broad = svdup_n_f32((float32_t)d);
4748
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4749
+ svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
4750
+
4751
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4752
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
4753
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4754
+
4755
+ const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8;
4756
+ const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s));
4757
+ const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, 4));
4758
+ svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums);
4759
+
4760
+ const svuint32_t mins_and_scales_sve_1 = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc);
4761
+ const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, m4s));
4762
+ const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, 4));
4763
+
4764
+ svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums+8);
4765
+
4766
+ svfloat32_t temp = svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_2, q8sums_sv_2)));
4767
+
4768
+ acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad);
4769
+
4770
+ svint32_t sumi1 = svdup_n_s32(0);
4771
+
4772
+ {
4773
+ const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
4774
+ svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s));
4775
+ svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4776
+
4777
+ svint32_t scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 0), svdup_lane_s32(scales_sv, 1));
4778
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
4779
+
4780
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 2), m3s));
4781
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4782
+
4783
+ svint32_t scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 2), svdup_lane_s32(scales_sv, 3));
4784
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(svdup_n_s32(0), q2bytes_sv, q8bytes_sv), scale_2);
4785
+
4786
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 4), m3s));
4787
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4788
+
4789
+ scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 4), svdup_lane_s32(scales_sv, 5));
4790
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
4791
+
4792
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 6), m3s));
4793
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4794
+
4795
+ scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 6), svdup_lane_s32(scales_sv, 7));
4796
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
4797
+
4798
+ q2 += 32;
4799
+
4800
+ const svuint8_t q2bits_2 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
4801
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_2, m3s));
4802
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4803
+
4804
+ scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 0), svdup_lane_s32(scales_sv_1, 1));
4805
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
4806
+
4807
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 2), m3s));
4808
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4809
+
4810
+ scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 2), svdup_lane_s32(scales_sv_1, 3));
4811
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
4812
+
4813
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 4), m3s));
4814
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4815
+
4816
+ scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 4), svdup_lane_s32(scales_sv_1, 5));
4817
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
4818
+
4819
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 6), m3s));
4820
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
4821
+
4822
+ scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7));
4823
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
4824
+ }
4825
+ acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad);
4826
+ }
4827
+ *s = svaddv_f32(svptrue_pat_b32(SV_VL8), acc_sum);
4828
+ break;
4829
+
4830
+ default:
4831
+ assert(false && "Unsupported vector length");
4832
+ break;
4833
+ }
4834
+
4835
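As a cross-check for the SVE paths above, the per-super-block arithmetic they vectorize can be written as a short scalar sketch. It is illustrative only and assumes the usual q2_K packing (QK_K == 256; each byte of scales holds a 4-bit sub-block scale in the low nibble and a 4-bit min in the high nibble; each byte of qs holds four 2-bit quants):

#include <stdint.h>

// One q2_K x q8_K super-block. d = y.d * fp16(x.d), dmin = y.d * fp16(x.dmin);
// the vector kernel negates dmin up front instead of subtracting here.
static float q2_K_superblock_dot_ref(const uint8_t scales[16], const uint8_t q2[64],
                                     const int8_t q8[256], const int16_t bsums[16],
                                     float d, float dmin) {
    int summs = 0;                                   // mins dotted with the q8 block sums
    for (int j = 0; j < 16; ++j) summs += bsums[j] * (scales[j] >> 4);

    int isum = 0, is = 0;
    for (int k = 0; k < 2; ++k) {                    // two 128-value halves
        for (int shift = 0; shift < 8; shift += 2) { // four 2-bit planes per byte
            for (int half = 0; half < 2; ++half) {   // 16 quants share one scale
                const int sc = scales[is++] & 0xF;
                int part = 0;
                for (int l = 0; l < 16; ++l)
                    part += q8[l] * ((q2[16*half + l] >> shift) & 3);
                isum += sc * part;
                q8 += 16;
            }
        }
        q2 += 32;
    }
    return d * isum - dmin * summs;
}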
+ #elif __ARM_NEON
4440
4836
  const uint8x16_t m3 = vdupq_n_u8(0x3);
4441
4837
  const uint8x16_t m4 = vdupq_n_u8(0xF);
4442
4838
 
@@ -4451,9 +4847,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4451
4847
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4452
4848
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4453
4849
 
4454
- const uint8_t * restrict q2 = x[i].qs;
4455
- const int8_t * restrict q8 = y[i].qs;
4456
- const uint8_t * restrict sc = x[i].scales;
4850
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4851
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4852
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4457
4853
 
4458
4854
  const uint8x16_t mins_and_scales = vld1q_u8(sc);
4459
4855
  const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
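The bulk of the non-SVE changes in these hunks swap the C99 keyword restrict for the GGML_RESTRICT macro, which keeps the same aliasing hint while staying buildable when the file is compiled as C++ or with MSVC. The macro is defined in ggml's own headers, not in this diff; it is typically along these lines (sketch only):

#ifdef __cplusplus
// 'restrict' is not a keyword in C++; fall back to compiler extensions.
#  if defined(_MSC_VER)
#    define GGML_RESTRICT __restrict
#  elif defined(__GNUC__)
#    define GGML_RESTRICT __restrict__
#  else
#    define GGML_RESTRICT
#  endif
#else
#  define GGML_RESTRICT restrict
#endif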
@@ -4516,8 +4912,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4516
4912
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4517
4913
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4518
4914
 
4519
- const uint8_t * restrict q2 = x[i].qs;
4520
- const int8_t * restrict q8 = y[i].qs;
4915
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4916
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4521
4917
 
4522
4918
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
4523
4919
  const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
@@ -4583,8 +4979,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4583
4979
  const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4584
4980
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4585
4981
 
4586
- const uint8_t * restrict q2 = x[i].qs;
4587
- const int8_t * restrict q8 = y[i].qs;
4982
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4983
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4588
4984
 
4589
4985
  // load mins and scales from block_q2_K.scales[QK_K/16]
4590
4986
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
@@ -4910,8 +5306,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4910
5306
  vector signed int vsumi6 = v0;
4911
5307
  vector signed int vsumi7 = v0;
4912
5308
 
4913
- const uint8_t * restrict q2 = x[i].qs;
4914
- const int8_t * restrict q8 = y[i].qs;
5309
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
5310
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4915
5311
 
4916
5312
  for (int j = 0; j < QK_K/128; ++j) {
4917
5313
  __builtin_prefetch(q2, 0, 1);
@@ -5002,8 +5398,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5002
5398
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5003
5399
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
5004
5400
 
5005
- const uint8_t * restrict q2 = x[i].qs;
5006
- const int8_t * restrict q8 = y[i].qs;
5401
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
5402
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5007
5403
 
5008
5404
  const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
5009
5405
  const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf);
@@ -5096,7 +5492,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5096
5492
  #endif
5097
5493
  }
5098
5494
 
5099
- void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
5495
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
5100
5496
  assert(n % QK_K == 0);
5101
5497
  assert(nrc == 1);
5102
5498
  UNUSED(nrc);
@@ -5107,12 +5503,187 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5107
5503
  const uint32_t kmask1 = 0x03030303;
5108
5504
  const uint32_t kmask2 = 0x0f0f0f0f;
5109
5505
 
5110
- const block_q3_K * restrict x = vx;
5111
- const block_q8_K * restrict y = vy;
5506
+ const block_q3_K * GGML_RESTRICT x = vx;
5507
+ const block_q8_K * GGML_RESTRICT y = vy;
5112
5508
 
5113
5509
  const int nb = n / QK_K;
5114
5510
 
5115
- #ifdef __ARM_NEON
5511
+ #if defined(__ARM_FEATURE_SVE)
5512
+
5513
+ uint32_t aux[3];
5514
+ uint32_t utmp[4];
5515
+
5516
+ const int8_t m32 = 32;
5517
+ const int vector_length = svcntb()*8;
5518
+ const svuint8_t m3b_sv = svdup_n_u8(0x3);
5519
+ const svint32_t vzero_sv = svdup_n_s32(0);
5520
+
5521
+ const svuint8_t m0_sv = svdup_n_u8(1);
5522
+ const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1);
5523
+ const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2);
5524
+ const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3);
5525
+
5526
+ float sum = 0;
5527
+
5528
+ for (int i = 0; i < nb; ++i) {
5529
+
5530
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5531
+
5532
+ const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
5533
+ const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
5534
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
5535
+
5536
+ // Set up scales
5537
+ memcpy(aux, x[i].scales, 12);
5538
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
5539
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
5540
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
5541
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
5542
+
5543
+ int8_t * scale = (int8_t *)utmp;
5544
+
5545
+ for (int j = 0; j < 16; ++j) scale[j] -= m32;
5546
+
5547
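The aux/utmp masking just above is a branch-free expansion of the 12 packed q3_K scale bytes into sixteen signed 6-bit scales. Written out per element it is equivalent to this sketch (illustrative; it assumes the usual q3_K scale packing, with low nibbles in bytes 0..7, the high nibbles of the same bytes holding scales 8..15, and the top two bits of every scale packed into bytes 8..11):

#include <stdint.h>

// Expand the 12 packed q3_K scale bytes into sixteen signed 6-bit scales.
static void q3_K_unpack_scales_ref(const uint8_t packed[12], int8_t scales[16]) {
    for (int j = 0; j < 16; ++j) {
        const int low4  = (j < 8) ? (packed[j] & 0x0F) : (packed[j - 8] >> 4);
        const int high2 = (packed[8 + (j % 4)] >> (2 * (j / 4))) & 0x03;
        scales[j] = (int8_t)((low4 | (high2 << 4)) - 32);   // same offset as scale[j] -= m32
    }
}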
+ switch (vector_length) {
5548
+ case 128:
5549
+ {
5550
+ svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv);
5551
+ svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16);
5552
+ svuint8_t q3h_sv;
5553
+
5554
+ svint32_t sumi1_1 = svdup_n_s32(0);
5555
+ svint8_t q3bytes_sv;
5556
+
5557
+ for (int j = 0; j < QK_K/128; ++j) {
5558
+
5559
+ const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5560
+ const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5561
+ svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5562
+ svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5563
+
5564
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2);
5565
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5566
+
5567
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
5568
+
5569
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2);
5570
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5571
+
5572
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
5573
+
5574
+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5575
+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5576
+
5577
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1);
5578
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5579
+
5580
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
5581
+
5582
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1);
5583
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5584
+
5585
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
5586
+
5587
+
5588
+ scale += 4;
5589
+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5590
+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5591
+
5592
+ q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1);
5593
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5594
+
5595
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
5596
+
5597
+ q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2);
5598
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5599
+
5600
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
5601
+
5602
+
5603
+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5604
+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5605
+
5606
+ q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1);
5607
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5608
+
5609
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
5610
+
5611
+ q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1);
5612
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5613
+
5614
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
5615
+
5616
+ if (j == 0) {
5617
+ qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4);
5618
+ qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4);
5619
+ }
5620
+
5621
+ scale += 4;
5622
+ }
5623
+
5624
+ sum += d * (svaddv_s32(svptrue_b32(), sumi1_1));
5625
+ } break;
5626
+ case 256:
5627
+ case 512:
5628
+ {
5629
+ svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv);
5630
+ svuint8_t q3h_sv;
5631
+
5632
+ svint32_t sumi1_1 = svdup_n_s32(0);
5633
+ svint8_t q3bytes_sv;
5634
+
5635
+ for (int j = 0; j < QK_K/128; ++j) {
5636
+
5637
+ const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32;
5638
+ svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5639
+ svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5640
+
5641
+ q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2);
5642
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5643
+
5644
+
5645
+ svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5646
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
5647
+
5648
+ q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1);
5649
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5650
+
5651
+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5652
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
5653
+
5654
+ scale += 4;
5655
+ q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5656
+ q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5657
+
5658
+ q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv);
5659
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5660
+
5661
+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5662
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
5663
+
5664
+ q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1);
5665
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5666
+
5667
+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5668
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
5669
+
5670
+ if (j == 0) {
5671
+ qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4);
5672
+ }
5673
+
5674
+ scale += 4;
5675
+ }
5676
+
5677
+ sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1));
5678
+ } break;
5679
+ default:
5680
+ assert(false && "Unsupported vector length");
5681
+ break;
5682
+ }
5683
+ }
5684
+ *s = sum;
5685
+
5686
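Throughout the SVE q3_K code above, q3h_sv is built with BIC so that a clear hmask bit turns into a subtraction of 4 and a set bit into 0, which is the same as reconstructing the 3-bit value and subtracting 4. A scalar sketch of that decode, illustrative only:

#include <stdint.h>

// q3_K quant decode for one 32-value sub-block: 2 low bits from qs, one high
// bit from hmask; equivalently value = (low2 | hbit << 2) - 4.
static void q3_K_decode_32_ref(const uint8_t q3[32], const uint8_t hmask[32],
                               uint8_t hbit,   // 0x01, 0x02, ...: which bit-plane of hmask
                               int shift,      // 0, 2, 4 or 6: which 2-bit plane of qs
                               int8_t out[32]) {
    for (int l = 0; l < 32; ++l) {
        const int low2 = (q3[l] >> shift) & 3;
        out[l] = (int8_t)(low2 - ((hmask[l] & hbit) ? 0 : 4));
    }
}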
+ #elif __ARM_NEON
5116
5687
 
5117
5688
  uint32_t aux[3];
5118
5689
  uint32_t utmp[4];
@@ -5134,9 +5705,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5134
5705
 
5135
5706
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5136
5707
 
5137
- const uint8_t * restrict q3 = x[i].qs;
5138
- const uint8_t * restrict qh = x[i].hmask;
5139
- const int8_t * restrict q8 = y[i].qs;
5708
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5709
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
5710
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5140
5711
 
5141
5712
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
5142
5713
 
@@ -5220,8 +5791,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5220
5791
 
5221
5792
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5222
5793
 
5223
- const uint8_t * restrict q3 = x[i].qs;
5224
- const int8_t * restrict q8 = y[i].qs;
5794
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5795
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5225
5796
 
5226
5797
  // Set up scales
5227
5798
  memcpy(aux, x[i].scales, 12);
@@ -5325,8 +5896,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5325
5896
 
5326
5897
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5327
5898
 
5328
- const uint8_t * restrict q3 = x[i].qs;
5329
- const int8_t * restrict q8 = y[i].qs;
5899
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5900
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5330
5901
 
5331
5902
  // Set up scales
5332
5903
  aux = (const uint32_t *)x[i].scales;
@@ -5459,9 +6030,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5459
6030
 
5460
6031
  float sumf = 0;
5461
6032
  for (int i = 0; i < nb; ++i) {
5462
- const uint8_t * restrict q3 = x[i].qs;
5463
- const uint8_t * restrict hm = x[i].hmask;
5464
- const int8_t * restrict q8 = y[i].qs;
6033
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6034
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
6035
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5465
6036
 
5466
6037
  // Process blocks with SIMD
5467
6038
  int8_t * a = aux8;
@@ -5548,9 +6119,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5548
6119
  float sumf = 0;
5549
6120
  for (int i = 0; i < nb; ++i) {
5550
6121
 
5551
- const uint8_t * restrict q3 = x[i].qs;
5552
- const uint8_t * restrict qh = x[i].hmask;
5553
- const int8_t * restrict q8 = y[i].qs;
6122
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6123
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
6124
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5554
6125
 
5555
6126
  memcpy(aux, x[i].scales, 12);
5556
6127
  utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
@@ -5690,8 +6261,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5690
6261
  vector signed int vsumi6 = v0;
5691
6262
  vector signed int vsumi7 = v0;
5692
6263
 
5693
- const uint8_t * restrict q3 = x[i].qs;
5694
- const int8_t * restrict q8 = y[i].qs;
6264
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6265
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5695
6266
 
5696
6267
  for (int j = 0; j < QK_K/128; ++j) {
5697
6268
  __builtin_prefetch(q3, 0, 1);
@@ -5804,8 +6375,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5804
6375
  for (int i = 0; i < nb; ++i) {
5805
6376
 
5806
6377
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5807
- const uint8_t * restrict q3 = x[i].qs;
5808
- const int8_t * restrict q8 = y[i].qs;
6378
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6379
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5809
6380
  // Set up scales
5810
6381
  memcpy(aux, x[i].scales, 12);
5811
6382
  __m128i scales128 = lsx_set_w(
@@ -5890,11 +6461,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5890
6461
 
5891
6462
  float sumf = 0;
5892
6463
  for (int i = 0; i < nb; ++i) {
5893
- const uint8_t * restrict q3 = x[i].qs;
5894
- const uint8_t * restrict hm = x[i].hmask;
5895
- const int8_t * restrict q8 = y[i].qs;
6464
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6465
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
6466
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5896
6467
  memset(aux32, 0, 8*sizeof(int32_t));
5897
- int8_t * restrict a = aux8;
6468
+ int8_t * GGML_RESTRICT a = aux8;
5898
6469
  uint8_t m = 1;
5899
6470
  for (int j = 0; j < QK_K; j += 128) {
5900
6471
  for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
@@ -5937,7 +6508,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5937
6508
 
5938
6509
  }
5939
6510
 
5940
- void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
6511
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
5941
6512
  assert(n % QK_K == 0);
5942
6513
  assert(nrc == 1);
5943
6514
  UNUSED(nrc);
@@ -5945,8 +6516,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5945
6516
  UNUSED(by);
5946
6517
  UNUSED(bs);
5947
6518
 
5948
- const block_q4_K * restrict x = vx;
5949
- const block_q8_K * restrict y = vy;
6519
+ const block_q4_K * GGML_RESTRICT x = vx;
6520
+ const block_q8_K * GGML_RESTRICT y = vy;
5950
6521
 
5951
6522
  const int nb = n / QK_K;
5952
6523
 
@@ -5981,8 +6552,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5981
6552
 
5982
6553
  const uint8_t * scales = (const uint8_t *)utmp;
5983
6554
 
5984
- const uint8_t * restrict q4 = x[i].qs;
5985
- const int8_t * restrict q8 = y[i].qs;
6555
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6556
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5986
6557
 
5987
6558
  const int vector_length = ggml_cpu_get_sve_cnt()*8;
5988
6559
  const svuint8_t m4b = svdup_n_u8(0xf);
@@ -6069,8 +6640,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6069
6640
 
6070
6641
  const uint8_t * scales = (const uint8_t *)utmp;
6071
6642
 
6072
- const uint8_t * restrict q4 = x[i].qs;
6073
- const int8_t * restrict q8 = y[i].qs;
6643
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6644
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6074
6645
 
6075
6646
  int32_t sumi1 = 0;
6076
6647
  int32_t sumi2 = 0;
@@ -6108,8 +6679,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6108
6679
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6109
6680
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
6110
6681
 
6111
- const uint8_t * restrict q4 = x[i].qs;
6112
- const int8_t * restrict q8 = y[i].qs;
6682
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6683
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6113
6684
 
6114
6685
  // Process scales and mins
6115
6686
  memcpy(utmp, x[i].scales, 12);
@@ -6121,7 +6692,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6121
6692
 
6122
6693
  // Sum mins * q8sums
6123
6694
  int32_t sumi = 0;
6124
- const int16_t * restrict q8sums = y[i].bsums;
6695
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
6125
6696
  const uint8_t * m = (const uint8_t *)&utmp[2];
6126
6697
  for (int j = 0; j < 16; j += 2) {
6127
6698
  sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
@@ -6220,8 +6791,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6220
6791
  utmp[2] = uaux;
6221
6792
  utmp[0] &= kmask1;
6222
6793
 
6223
- const uint8_t * restrict q4 = x[i].qs;
6224
- const int8_t * restrict q8 = y[i].qs;
6794
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6795
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6225
6796
 
6226
6797
  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
6227
6798
 
@@ -6279,8 +6850,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6279
6850
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6280
6851
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
6281
6852
 
6282
- const uint8_t * restrict q4 = x[i].qs;
6283
- const int8_t * restrict q8 = y[i].qs;
6853
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6854
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6284
6855
 
6285
6856
  memcpy(utmp, x[i].scales, 12);
6286
6857
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -6380,8 +6951,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6380
6951
  vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
6381
6952
  sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
6382
6953
 
6383
- const uint8_t * restrict q4 = x[i].qs;
6384
- const int8_t * restrict q8 = y[i].qs;
6954
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6955
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6385
6956
 
6386
6957
  vl = 32;
6387
6958
 
@@ -6482,8 +7053,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6482
7053
  vector signed int vsumi2 = v0;
6483
7054
  vector signed int vsumi3 = v0;
6484
7055
 
6485
- const uint8_t * restrict q4 = x[i].qs;
6486
- const int8_t * restrict q8 = y[i].qs;
7056
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7057
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6487
7058
 
6488
7059
  for (int j = 0; j < QK_K/64; j+=2) {
6489
7060
  __builtin_prefetch(q4, 0, 1);
@@ -6574,8 +7145,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6574
7145
  utmp[2] = uaux;
6575
7146
  utmp[0] &= kmask1;
6576
7147
 
6577
- const uint8_t * restrict q4 = x[i].qs;
6578
- const int8_t * restrict q8 = y[i].qs;
7148
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7149
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6579
7150
 
6580
7151
  const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
6581
7152
  const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
@@ -6622,6 +7193,77 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6622
7193
 
6623
7194
 
6624
7195
  *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
7196
+ #elif defined(__VXE__) || defined(__VXE2__)
7197
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
7198
+ const int32x4_t v_z = vec_splat_s32(0);
7199
+
7200
+ uint8x16_t v_x[2];
7201
+ int8x16_t v_xl[2];
7202
+ int8x16_t v_y[2];
7203
+
7204
+ float sumf = 0;
7205
+
7206
+ for (int i = 0; i < nb; ++i) {
7207
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7208
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
7209
+
7210
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
7211
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
7212
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
7213
+
7214
+ memcpy(utmp, x[i].scales, 12);
7215
+
7216
+ uint32x4_t v_mins8 = { 0 };
7217
+ v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0);
7218
+ v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1);
7219
+
7220
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
7221
+ utmp[0] &= kmask1;
7222
+
7223
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
7224
+
7225
+ const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
7226
+ const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
7227
+ const int32x4_t v_mins = v_minso + v_minse;
7228
+ sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
7229
+
7230
+ const uint8_t * scales = (const uint8_t *)utmp;
7231
+ const uint8_t * GGML_RESTRICT x0 = x[i].qs;
7232
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
7233
+
7234
+ int32_t sumi1 = 0;
7235
+ int32_t sumi2 = 0;
7236
+
7237
+ for (int j = 0; j < QK_K/64; ++j) {
7238
+ v_x[0] = vec_xl(0 , x0);
7239
+ v_x[1] = vec_xl(16, x0);
7240
+ x0 += 32;
7241
+
7242
+ v_y[0] = vec_xl(0 , y0);
7243
+ v_y[1] = vec_xl(16, y0);
7244
+ y0 += 32;
7245
+
7246
+ v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm);
7247
+ v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
7248
+
7249
+ const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
7250
+ sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
7251
+
7252
+ v_y[0] = vec_xl(0 , y0);
7253
+ v_y[1] = vec_xl(16, y0);
7254
+ y0 += 32;
7255
+
7256
+ v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4);
7257
+ v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
7258
+
7259
+ const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
7260
+ sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
7261
+ }
7262
+
7263
+ sumf += d * (sumi1 + sumi2);
7264
+ }
7265
+
7266
+ *s = sumf;
6625
7267
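Every q4_K (and q5_K) path above runs the same kmask/utmp shuffle before the quant loop; it is an in-register form of unpacking the 12 scale bytes into eight 6-bit sub-block scales and eight 6-bit mins. A scalar sketch of that unpacking, illustrative and assuming the standard 12-byte packing:

#include <stdint.h>

// Unpack 12 packed scale bytes into eight 6-bit scales and eight 6-bit mins
// (the layout the kmask twiddling leaves in the two halves of utmp).
static void k4_unpack_scales_mins_ref(const uint8_t packed[12],
                                      uint8_t scales[8], uint8_t mins[8]) {
    for (int j = 0; j < 8; ++j) {
        if (j < 4) {
            scales[j] = packed[j] & 63;
            mins[j]   = packed[j + 4] & 63;
        } else {
            scales[j] = (packed[j + 4] & 0x0F) | ((packed[j - 4] >> 6) << 4);
            mins[j]   = (packed[j + 4] >>  4) | ((packed[j    ] >> 6) << 4);
        }
    }
}

The mins never touch the quants: they are dotted with the q8 block sums (bsums, summed in pairs to match the 32-value sub-blocks) and subtracted once as dmin times that total, which is what the v_ysums/v_minsh code above computes.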
  #else
6626
7268
 
6627
7269
  const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -6635,10 +7277,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6635
7277
 
6636
7278
  float sumf = 0;
6637
7279
  for (int i = 0; i < nb; ++i) {
6638
- const uint8_t * restrict q4 = x[i].qs;
6639
- const int8_t * restrict q8 = y[i].qs;
7280
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7281
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6640
7282
  memset(aux32, 0, 8*sizeof(int32_t));
6641
- int8_t * restrict a = aux8;
7283
+ int8_t * GGML_RESTRICT a = aux8;
6642
7284
  for (int j = 0; j < QK_K/64; ++j) {
6643
7285
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
6644
7286
  a += 32;
@@ -6681,7 +7323,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6681
7323
  #endif
6682
7324
  }
6683
7325
 
6684
- void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
7326
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
6685
7327
  assert(n % QK_K == 0);
6686
7328
  assert(nrc == 1);
6687
7329
  UNUSED(nrc);
@@ -6689,8 +7331,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6689
7331
  UNUSED(by);
6690
7332
  UNUSED(bs);
6691
7333
 
6692
- const block_q5_K * restrict x = vx;
6693
- const block_q8_K * restrict y = vy;
7334
+ const block_q5_K * GGML_RESTRICT x = vx;
7335
+ const block_q8_K * GGML_RESTRICT y = vy;
6694
7336
 
6695
7337
  const int nb = n / QK_K;
6696
7338
 
@@ -6732,9 +7374,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6732
7374
 
6733
7375
  const uint8_t * scales = (const uint8_t *)utmp;
6734
7376
 
6735
- const uint8_t * restrict q5 = x[i].qs;
6736
- const uint8_t * restrict qh = x[i].qh;
6737
- const int8_t * restrict q8 = y[i].qs;
7377
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7378
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7379
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6738
7380
 
6739
7381
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
6740
7382
 
@@ -6779,8 +7421,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6779
7421
  float summs = 0.f;
6780
7422
 
6781
7423
  for (int i = 0; i < nb; ++i) {
6782
- const uint8_t * restrict q5 = x[i].qs;
6783
- const int8_t * restrict q8 = y[i].qs;
7424
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7425
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6784
7426
 
6785
7427
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6786
7428
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -6863,8 +7505,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6863
7505
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6864
7506
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
6865
7507
 
6866
- const uint8_t * restrict q5 = x[i].qs;
6867
- const int8_t * restrict q8 = y[i].qs;
7508
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7509
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6868
7510
 
6869
7511
  memcpy(utmp, x[i].scales, 12);
6870
7512
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -6955,9 +7597,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6955
7597
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6956
7598
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
6957
7599
 
6958
- const uint8_t * restrict q5 = x[i].qs;
6959
- const uint8_t * restrict qh = x[i].qh;
6960
- const int8_t * restrict q8 = y[i].qs;
7600
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7601
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7602
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6961
7603
 
6962
7604
  // Process scales and mins
6963
7605
  memcpy(utmp, x[i].scales, 12);
@@ -6969,7 +7611,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6969
7611
 
6970
7612
  // Sum mins * q8sums
6971
7613
  int32_t sumi_mins = 0;
6972
- const int16_t * restrict q8sums = y[i].bsums;
7614
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
6973
7615
  const uint8_t * m = (const uint8_t *)&utmp[2];
6974
7616
  for (int j = 0; j < 16; j += 2) {
6975
7617
  sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
@@ -7073,9 +7715,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7073
7715
 
7074
7716
  vl = 8;
7075
7717
 
7076
- const uint8_t * restrict q5 = x[i].qs;
7077
- const uint8_t * restrict hm = x[i].qh;
7078
- const int8_t * restrict q8 = y[i].qs;
7718
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7719
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
7720
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7079
7721
 
7080
7722
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7081
7723
  const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
@@ -7214,8 +7856,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7214
7856
  vector signed int vsumi2 = v0;
7215
7857
  vector signed int vsumi3 = v0;
7216
7858
 
7217
- const uint8_t * restrict q5 = x[i].qs;
7218
- const int8_t * restrict q8 = y[i].qs;
7859
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7860
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7219
7861
 
7220
7862
  for (int j = 0; j < QK_K/64; ++j) {
7221
7863
  __builtin_prefetch(q5, 0, 1);
@@ -7287,8 +7929,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7287
7929
 
7288
7930
  for (int i = 0; i < nb; ++i) {
7289
7931
 
7290
- const uint8_t * restrict q5 = x[i].qs;
7291
- const int8_t * restrict q8 = y[i].qs;
7932
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7933
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7292
7934
 
7293
7935
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7294
7936
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -7351,7 +7993,94 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7351
7993
  acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4));
7352
7994
 
7353
7995
  *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
7996
+ #elif defined(__VXE__) || defined(__VXE2__)
7997
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
7998
+ const uint8x16_t v_1m = vec_splat_u8(0x01);
7999
+ const uint8x16_t v_2m = vec_splat_u8(0x02);
7354
8000
 
8001
+ const int32x4_t v_z = vec_splat_s32(0);
8002
+
8003
+ const uchar8x16_t v_minsm = {
8004
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
8005
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
8006
+ };
8007
+
8008
+ int8x16_t q5b[4];
8009
+ uint8x16_t q5h[4];
8010
+
8011
+ uint8x16_t v_xl[2];
8012
+ uint8x16_t v_xh[2];
8013
+ int8x16_t v_y[4];
8014
+
8015
+ float sumf = 0;
8016
+
8017
+ for (int i = 0; i < nb; ++i) {
8018
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8019
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
8020
+
8021
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
8022
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
8023
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
8024
+
8025
+ memcpy(utmp, x[i].scales, 12);
8026
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
8027
+ const uint32_t uaux = utmp[1] & kmask1;
8028
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
8029
+ utmp[2] = uaux;
8030
+ utmp[0] &= kmask1;
8031
+
8032
+ const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
8033
+ const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
8034
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
8035
+
8036
+ const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
8037
+ const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
8038
+ const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
8039
+ const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
8040
+
8041
+ const uint8_t * scales = (const uint8_t *)utmp;
8042
+ const uint8_t * GGML_RESTRICT x0l = x[i].qs;
8043
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
8044
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
8045
+
8046
+ v_xh[0] = vec_xl(0 , x0h);
8047
+ v_xh[1] = vec_xl(16, x0h);
8048
+
8049
+ int32_t sumi = 0;
8050
+ for (int j = 0; j < QK_K/64; ++j) {
8051
+ v_xl[0] = vec_xl(0 , x0l);
8052
+ v_xl[1] = vec_xl(16, x0l);
8053
+ x0l += 32;
8054
+
8055
+ v_y[0] = vec_xl(0 , y0);
8056
+ v_y[1] = vec_xl(16, y0);
8057
+ v_y[2] = vec_xl(32, y0);
8058
+ v_y[3] = vec_xl(48, y0);
8059
+ y0 += 64;
8060
+
8061
+ q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
8062
+ q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
8063
+ q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
8064
+ q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
8065
+ v_xh[0] = vec_sr(v_xh[0], 2);
8066
+ v_xh[1] = vec_sr(v_xh[1], 2);
8067
+
8068
+ q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
8069
+ q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
8070
+ q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
8071
+ q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);
8072
+
8073
+ int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
8074
+ int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
8075
+
8076
+ sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
8077
+ sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
8078
+ }
8079
+
8080
+ sumf += d * sumi - dmin * mins;
8081
+ }
8082
+
8083
+ *s = sumf;
7355
8084
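The 0x01/0x02 masks above pull the fifth bit of each quant out of qh, two bit-planes per 64-value pass, after which qh is shifted right by 2 for the next pass. Element-wise the reconstruction is the following sketch (illustrative only):

#include <stdint.h>

// q5_K quant reconstruction for one 32-value sub-block: low 4 bits from qs,
// fifth bit from the selected bit-plane of qh (adds 16 when set).
static void q5_K_decode_32_ref(const uint8_t q5[32], const uint8_t qh[32],
                               uint8_t hbit,          // 1, 2, 4, ...: which bit of qh
                               int use_high_nibble,   // 0: qs & 0xF, 1: qs >> 4
                               uint8_t out[32]) {
    for (int l = 0; l < 32; ++l) {
        const uint8_t low4 = use_high_nibble ? (uint8_t)(q5[l] >> 4)
                                             : (uint8_t)(q5[l] & 0x0F);
        out[l] = (uint8_t)(low4 + ((qh[l] & hbit) ? 16 : 0));
    }
}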
  #else
7356
8085
 
7357
8086
  const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -7365,11 +8094,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7365
8094
 
7366
8095
  float sumf = 0;
7367
8096
  for (int i = 0; i < nb; ++i) {
7368
- const uint8_t * restrict q4 = x[i].qs;
7369
- const uint8_t * restrict hm = x[i].qh;
7370
- const int8_t * restrict q8 = y[i].qs;
8097
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
8098
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
8099
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7371
8100
  memset(aux32, 0, 8*sizeof(int32_t));
7372
- int8_t * restrict a = aux8;
8101
+ int8_t * GGML_RESTRICT a = aux8;
7373
8102
  uint8_t m = 1;
7374
8103
  for (int j = 0; j < QK_K/64; ++j) {
7375
8104
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
@@ -7416,7 +8145,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7416
8145
  #endif
7417
8146
  }
7418
8147
 
7419
- void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
8148
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
7420
8149
  assert(n % QK_K == 0);
7421
8150
  assert(nrc == 1);
7422
8151
  UNUSED(nrc);
@@ -7424,12 +8153,161 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7424
8153
  UNUSED(by);
7425
8154
  UNUSED(bs);
7426
8155
 
7427
- const block_q6_K * restrict x = vx;
7428
- const block_q8_K * restrict y = vy;
8156
+ const block_q6_K * GGML_RESTRICT x = vx;
8157
+ const block_q8_K * GGML_RESTRICT y = vy;
7429
8158
 
7430
8159
  const int nb = n / QK_K;
7431
8160
 
7432
- #ifdef __ARM_NEON
8161
+ #ifdef __ARM_FEATURE_SVE
8162
+ const int vector_length = ggml_cpu_get_sve_cnt()*8;
8163
+ float sum = 0;
8164
+ svuint8_t m4b = svdup_n_u8(0xf);
8165
+ svint32_t vzero = svdup_n_s32(0);
8166
+ svuint8_t mone = svdup_n_u8(0x30);
8167
+ svint8_t q6bytes_1, q6bytes_2, q6bytes_3, q6bytes_4;
8168
+ svuint8_t q6h_1, q6h_2, q6h_3, q6h_4;
8169
+
8170
+ for (int i = 0; i < nb; ++i) {
8171
+ const float d_all = GGML_FP16_TO_FP32(x[i].d);
8172
+
8173
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8174
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8175
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8176
+
8177
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
8178
+
8179
+ const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
8180
+ const svint16_t q8sums_1 = svld1_s16(pg16_8, y[i].bsums);
8181
+ const svint16_t q8sums_2 = svld1_s16(pg16_8, y[i].bsums + 8);
8182
+ const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale));
8183
+ const svint16_t q6scales_2 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale + 8));
8184
+ const svint64_t prod = svdup_n_s64(0);
8185
+ int32_t isum_mins = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(prod, q8sums_1, q6scales_1),
8186
+ svdot_s64(prod, q8sums_2, q6scales_2)));
8187
+ int32_t isum = 0;
8188
+
8189
+ switch (vector_length) {
8190
+ case 128:
8191
+ {
8192
+ const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4);
8193
+ const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16);
8194
+ svint32_t isum_tmp = svdup_n_s32(0);
8195
+ for (int j = 0; j < QK_K/128; ++j) {
8196
+ svuint8_t qhbits_1 = svld1_u8(pg8_16, qh);
8197
+ svuint8_t qhbits_2 = svld1_u8(pg8_16, qh+16);
8198
+ qh += 32;
8199
+ svuint8_t q6bits_1 = svld1_u8(pg8_16, q6);
8200
+ svuint8_t q6bits_2 = svld1_u8(pg8_16, q6+16);
8201
+ svuint8_t q6bits_3 = svld1_u8(pg8_16, q6+32);
8202
+ svuint8_t q6bits_4 = svld1_u8(pg8_16, q6+48);
8203
+ q6 += 64;
8204
+ svint8_t q8bytes_1 = svld1_s8(pg8_16, q8);
8205
+ svint8_t q8bytes_2 = svld1_s8(pg8_16, q8+16);
8206
+ svint8_t q8bytes_3 = svld1_s8(pg8_16, q8+32);
8207
+ svint8_t q8bytes_4 = svld1_s8(pg8_16, q8+48);
8208
+ q8 += 64;
8209
+
8210
+ q6h_1 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 4));
8211
+ q6h_2 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 4));
8212
+ q6h_3 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 2));
8213
+ q6h_4 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 2));
8214
+ q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_1, m4b), q6h_1));
8215
+ q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_2, m4b), q6h_2));
8216
+ q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_3, m4b), q6h_3));
8217
+ q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_4, m4b), q6h_4));
8218
+ isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]);
8219
+ isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]);
8220
+ isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]);
8221
+ isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]);
8222
+
8223
+ scale += 4;
8224
+ q8bytes_1 = svld1_s8(pg8_16, q8);
8225
+ q8bytes_2 = svld1_s8(pg8_16, q8+16);
8226
+ q8bytes_3 = svld1_s8(pg8_16, q8+32);
8227
+ q8bytes_4 = svld1_s8(pg8_16, q8+48);
8228
+ q8 += 64;
8229
+
8230
+ q6h_1 = svand_u8_x(pg16_8, mone, qhbits_1);
8231
+ q6h_2 = svand_u8_x(pg16_8, mone, qhbits_2);
8232
+ q6h_3 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_1, 2));
8233
+ q6h_4 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_2, 2));
8234
+ q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_1, 4), q6h_1));
8235
+ q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_2, 4), q6h_2));
8236
+ q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_3, 4), q6h_3));
8237
+ q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_4, 4), q6h_4));
8238
+ isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]);
8239
+ isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]);
8240
+ isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]);
8241
+ isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]);
8242
+ scale += 4;
8243
+ }
8244
+ isum += svaddv_s32(pg32_4, isum_tmp);
8245
+ sum += d_all * y[i].d * (isum - 32 * isum_mins);
8246
+ }
8247
+ break;
8248
+ case 256:
8249
+ case 512:
8250
+ {
8251
+ const svbool_t pg8_2 = svptrue_pat_b8(SV_VL2);
8252
+ const svbool_t pg32_8 = svptrue_pat_b32(SV_VL8);
8253
+ const svbool_t pg8_32 = svptrue_pat_b8(SV_VL32);
8254
+ svint32_t isum_tmp = svdup_n_s32(0);
8255
+ for (int j = 0; j < QK_K/128; j++) {
8256
+ svuint8_t qhbits_1 = svld1_u8(pg8_32, qh);
8257
+ qh += 32;
8258
+ svuint8_t q6bits_1 = svld1_u8(pg8_32, q6);
8259
+ svuint8_t q6bits_2 = svld1_u8(pg8_32, q6+32);
8260
+ q6 += 64;
8261
+ svint8_t q8bytes_1 = svld1_s8(pg8_32, q8);
8262
+ svint8_t q8bytes_2 = svld1_s8(pg8_32, q8+32);
8263
+ svint8_t q8bytes_3 = svld1_s8(pg8_32, q8+64);
8264
+ svint8_t q8bytes_4 = svld1_s8(pg8_32, q8+96);
8265
+ q8 += 128;
8266
+ q6h_1 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 4));
8267
+ q6h_2 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 2));
8268
+ q6h_3 = svand_u8_x(pg8_32, mone, qhbits_1);
8269
+ q6h_4 = svand_u8_x(pg8_32, mone, svlsr_n_u8_x(pg8_32, qhbits_1, 2));
8270
+ q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_1, m4b), q6h_1));
8271
+ q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_2, m4b), q6h_2));
8272
+ q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_1, 4), q6h_3));
8273
+ q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_2, 4), q6h_4));
8274
+
8275
+ svint8_t scale_lane_1_tmp = svld1_s8(pg8_2, scale);
8276
+ scale_lane_1_tmp = svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp);
8277
+ scale_lane_1_tmp = svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp);
8278
+ svint8_t scale_lane_2_tmp = svld1_s8(pg8_2, scale+2);
8279
+ scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp);
8280
+ scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp);
8281
+ svint8_t scale_lane_3_tmp = svld1_s8(pg8_2, scale+4);
8282
+ scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp);
8283
+ scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp);
8284
+ svint8_t scale_lane_4_tmp = svld1_s8(pg8_2, scale+6);
8285
+ scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp);
8286
+ scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp);
8287
+ svint32_t scale_lane_1 = svunpklo_s32(svunpklo_s16(scale_lane_1_tmp));
8288
+ svint32_t scale_lane_2 = svunpklo_s32(svunpklo_s16(scale_lane_2_tmp));
8289
+ svint32_t scale_lane_3 = svunpklo_s32(svunpklo_s16(scale_lane_3_tmp));
8290
+ svint32_t scale_lane_4 = svunpklo_s32(svunpklo_s16(scale_lane_4_tmp));
8291
+
8292
+ isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale_lane_1);
8293
+ isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale_lane_2);
8294
+ isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale_lane_3);
8295
+ isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale_lane_4);
8296
+ scale += 8;
8297
+ }
8298
+ isum += svaddv_s32(pg32_8, isum_tmp);
8299
+ sum += d_all * y[i].d * (isum - 32 * isum_mins);
8300
+ }
8301
+ break;
8302
+ default:
8303
+ assert(false && "Unsupported vector length");
8304
+ break;
8305
+ }
8306
+ }
8307
+
8308
+ *s = sum;
8309
+
8310
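The SVE q6_K path above reconstructs the unsigned 6-bit value (4 low bits from ql, 2 high bits from qh) and never subtracts the 32 offset per element; instead the offset is folded into isum_mins, the dot of the block sums with the 16 sub-block scales, and removed once as isum - 32 * isum_mins. The per-element decode it vectorizes is, as a sketch:

#include <stdint.h>

// q6_K quant decode for one 32-value sub-block: 4 low bits from ql, 2 high
// bits from qh, minus the constant 32 offset.
static void q6_K_decode_32_ref(const uint8_t ql[32], const uint8_t qh[32],
                               int ql_shift,   // 0: ql & 0xF, 4: ql >> 4
                               int qh_shift,   // 0, 2, 4 or 6: which 2-bit plane of qh
                               int8_t out[32]) {
    for (int l = 0; l < 32; ++l) {
        const int low4  = (ql[l] >> ql_shift) & 0x0F;
        const int high2 = (qh[l] >> qh_shift) & 0x03;
        out[l] = (int8_t)((low4 | (high2 << 4)) - 32);
    }
}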
+ #elif __ARM_NEON
7433
8311
  float sum = 0;
7434
8312
 
7435
8313
  const uint8x16_t m4b = vdupq_n_u8(0xF);
@@ -7445,11 +8323,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7445
8323
 
7446
8324
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
7447
8325
 
7448
- const uint8_t * restrict q6 = x[i].ql;
7449
- const uint8_t * restrict qh = x[i].qh;
7450
- const int8_t * restrict q8 = y[i].qs;
8326
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8327
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8328
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7451
8329
 
7452
- const int8_t * restrict scale = x[i].scales;
8330
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
7453
8331
 
7454
8332
  const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
7455
8333
  const int8x16_t scales = vld1q_s8(scale);
@@ -7536,9 +8414,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7536
8414
 
7537
8415
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7538
8416
 
7539
- const uint8_t * restrict q4 = x[i].ql;
7540
- const uint8_t * restrict qh = x[i].qh;
7541
- const int8_t * restrict q8 = y[i].qs;
8417
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8418
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8419
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7542
8420
 
7543
8421
  const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
7544
8422
 
@@ -7614,9 +8492,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7614
8492
 
7615
8493
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7616
8494
 
7617
- const uint8_t * restrict q4 = x[i].ql;
7618
- const uint8_t * restrict qh = x[i].qh;
7619
- const int8_t * restrict q8 = y[i].qs;
8495
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8496
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8497
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7620
8498
 
7621
8499
  // handle the q6_k -32 offset separately using bsums
7622
8500
  const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
@@ -7715,8 +8593,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7715
8593
 
7716
8594
  for (int i = 0; i < nb; ++i) {
7717
8595
  // Unpack 6-bit quantized data into aux8 (unchanged)
7718
- const uint8_t * restrict q4 = x[i].ql;
7719
- const uint8_t * restrict qh = x[i].qh;
8596
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8597
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7720
8598
  int8_t * a = aux8;
7721
8599
  for (int j = 0; j < QK_K; j += 128) {
7722
8600
  for (int l = 0; l < 32; ++l) {
@@ -7730,8 +8608,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7730
8608
  qh += 32;
7731
8609
  }
7732
8610
 
7733
- const int8_t * restrict a_ptr = aux8;
7734
- const int8_t * restrict q8 = y[i].qs;
8611
+ const int8_t * GGML_RESTRICT a_ptr = aux8;
8612
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7735
8613
  v128_t acc0 = wasm_i32x4_splat(0);
7736
8614
  v128_t acc1 = wasm_i32x4_splat(0);
7737
8615
 
@@ -7794,11 +8672,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7794
8672
 
7795
8673
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7796
8674
 
7797
- const uint8_t * restrict q6 = x[i].ql;
7798
- const uint8_t * restrict qh = x[i].qh;
7799
- const int8_t * restrict q8 = y[i].qs;
8675
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8676
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8677
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7800
8678
 
7801
- const int8_t * restrict scale = x[i].scales;
8679
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
7802
8680
 
7803
8681
  size_t vl;
7804
8682
 
@@ -7900,10 +8778,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7900
8778
  vector signed int vsumi6 = v0;
7901
8779
  vector signed int vsumi7 = v0;
7902
8780
 
7903
- const uint8_t * restrict q6 = x[i].ql;
7904
- const uint8_t * restrict qh = x[i].qh;
7905
- const int8_t * restrict qs = x[i].scales;
7906
- const int8_t * restrict q8 = y[i].qs;
8781
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8782
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8783
+ const int8_t * GGML_RESTRICT qs = x[i].scales;
8784
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7907
8785
 
7908
8786
  for (int j = 0; j < QK_K/128; ++j) {
7909
8787
  __builtin_prefetch(q6, 0, 0);
@@ -8019,9 +8897,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8019
8897
 
8020
8898
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8021
8899
 
8022
- const uint8_t * restrict q4 = x[i].ql;
8023
- const uint8_t * restrict qh = x[i].qh;
8024
- const int8_t * restrict q8 = y[i].qs;
8900
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8901
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8902
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8025
8903
 
8026
8904
  const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
8027
8905
  const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
@@ -8068,7 +8946,130 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8068
8946
  }
8069
8947
 
8070
8948
  *s = hsum_float_8(acc);
8949
+ #elif defined(__VXE__) || defined(__VXE2__)
8950
+ float sum = 0;
8951
+
8952
+ // Lower 4-bit and upper 2-bit masks
8953
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
8954
+ const uint8x16_t v_um = vec_splat_u8(0x03);
8955
+
8956
+ const int32x4_t v_z = vec_splat_s32(0);
8071
8957
 
8958
+ int8x16_t q6b[4];
8959
+ uint8x16_t q6h[4];
8960
+
8961
+ uint8x16_t v_xl[4];
8962
+ uint8x16_t v_xh[2];
8963
+ int8x16_t v_y[4];
8964
+
8965
+ for (int i = 0; i < nb; ++i) {
8966
+ const float d_all = GGML_FP16_TO_FP32(x[i].d);
8967
+
8968
+ const uint8_t * GGML_RESTRICT x0l = x[i].ql;
8969
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
8970
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
8971
+
8972
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
8973
+
8974
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
8975
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
8976
+
8977
+ const int8x16_t v_scale = vec_xl(0, scale);
8978
+ const int16x8_t v_scalel = vec_unpackh(v_scale);
8979
+ const int16x8_t v_scaleh = vec_unpackl(v_scale);
8980
+
8981
+ const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
8982
+ const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
8983
+ const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
8984
+ const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
8985
+ const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
8986
+
8987
+ const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
8988
+
8989
+ int32_t isum = 0;
8990
+ for (int j = 0; j < QK_K/128; ++j) {
8991
+ // Load model upper 2 bits
8992
+ v_xh[0] = vec_xl(0 , x0h);
8993
+ v_xh[1] = vec_xl(16, x0h);
8994
+ x0h += 32;
8995
+
8996
+ // Load model lower 4 bits
8997
+ v_xl[0] = vec_xl(0 , x0l);
8998
+ v_xl[1] = vec_xl(16, x0l);
8999
+ v_xl[2] = vec_xl(32, x0l);
9000
+ v_xl[3] = vec_xl(48, x0l);
9001
+ x0l += 64;
9002
+
9003
+ // Load activation quants
9004
+ v_y[0] = vec_xl(0 , y0);
9005
+ v_y[1] = vec_xl(16, y0);
9006
+ v_y[2] = vec_xl(32, y0);
9007
+ v_y[3] = vec_xl(48, y0);
9008
+ y0 += 64;
9009
+
9010
+ q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4);
9011
+ q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4);
9012
+ uint8x16_t shifted = vec_sr(v_xh[0], 2);
9013
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
9014
+ shifted = vec_sr(v_xh[1], 2);
9015
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
9016
+
9017
+ q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0]));
9018
+ q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1]));
9019
+ q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2]));
9020
+ q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3]));
9021
+
9022
+ int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
9023
+ int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
9024
+ int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
9025
+ int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
9026
+
9027
+ isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
9028
+ (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
9029
+ (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
9030
+ (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
9031
+
9032
+ scale += 4;
9033
+
9034
+
9035
+ // Load activation quants
9036
+ v_y[0] = vec_xl(0 , y0);
9037
+ v_y[1] = vec_xl(16, y0);
9038
+ v_y[2] = vec_xl(32, y0);
9039
+ v_y[3] = vec_xl(48, y0);
9040
+ y0 += 64;
9041
+
9042
+ shifted = vec_sr(v_xh[0], 4);
9043
+ q6h[0] = vec_sl(vec_and(v_um, shifted), 4);
9044
+ shifted = vec_sr(v_xh[1], 4);
9045
+ q6h[1] = vec_sl(vec_and(v_um, shifted), 4);
9046
+ shifted = vec_sr(v_xh[0], 6);
9047
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
9048
+ shifted = vec_sr(v_xh[1], 6);
9049
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
9050
+
9051
+ q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0]));
9052
+ q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1]));
9053
+ q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2]));
9054
+ q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3]));
9055
+
9056
+ summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
9057
+ summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
9058
+ summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
9059
+ summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
9060
+
9061
+ isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
9062
+ (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
9063
+ (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
9064
+ (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
9065
+
9066
+ scale += 4;
9067
+ }
9068
+
9069
+ sum += d_all * y[i].d * (isum - 32 * mins);
9070
+ }
9071
+
9072
+ *s = sum;
8072
9073
  #else
8073
9074
 
8074
9075
  int8_t aux8[QK_K];
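The __VXE__/__VXE2__ branch added above (IBM z vector extensions) follows the same structure as the other SIMD paths for q6_K: each weight is stored as a low nibble in ql plus two high bits in qh, and the constant -32 offset is pulled out of the inner loop by using the per-16-element activation sums in y[i].bsums. A scalar sketch of the identity it relies on (comments only, not shipped code; names follow the surrounding diff):

    /* q6 = (ql[l] & 0xF) | (((qh[l] >> shift) & 3) << 4);   // 0..63, real weight is q6 - 32
     * sum_l scale[l/16] * (q6 - 32) * q8[l]
     *   == sum_l scale[l/16] * q6 * q8[l]  -  32 * sum_g scale[g] * bsums[g]
     * where bsums[g] is the precomputed sum of 16 q8 values, so the kernel ends with
     *   sum += d_all * y[i].d * (isum - 32 * mins);
     */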
@@ -8079,11 +9080,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8079
9080
 
8080
9081
  float sumf = 0;
8081
9082
  for (int i = 0; i < nb; ++i) {
8082
- const uint8_t * restrict q4 = x[i].ql;
8083
- const uint8_t * restrict qh = x[i].qh;
8084
- const int8_t * restrict q8 = y[i].qs;
9083
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
9084
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
9085
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8085
9086
  memset(aux32, 0, 8*sizeof(int32_t));
8086
- int8_t * restrict a = aux8;
9087
+ int8_t * GGML_RESTRICT a = aux8;
8087
9088
  for (int j = 0; j < QK_K; j += 128) {
8088
9089
  for (int l = 0; l < 32; ++l) {
8089
9090
  a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
@@ -8151,7 +9152,7 @@ static const int8_t keven_signs_q2xs[1024] = {
8151
9152
  };
8152
9153
  #endif
8153
9154
 
8154
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9155
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
8155
9156
  assert(n % QK_K == 0);
8156
9157
  assert(nrc == 1);
8157
9158
  UNUSED(nrc);
@@ -8159,8 +9160,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8159
9160
  UNUSED(by);
8160
9161
  UNUSED(bs);
8161
9162
 
8162
- const block_iq2_xxs * restrict x = vx;
8163
- const block_q8_K * restrict y = vy;
9163
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
9164
+ const block_q8_K * GGML_RESTRICT y = vy;
8164
9165
 
8165
9166
  const int nb = n / QK_K;
8166
9167
 
@@ -8178,8 +9179,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8178
9179
  float sumf = 0;
8179
9180
  for (int i = 0; i < nb; ++i) {
8180
9181
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8181
- const uint16_t * restrict q2 = x[i].qs;
8182
- const int8_t * restrict q8 = y[i].qs;
9182
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9183
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8183
9184
  float sumf1 = 0, sumf2 = 0;
8184
9185
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
8185
9186
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -8215,8 +9216,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8215
9216
  __m256 accumf = _mm256_setzero_ps();
8216
9217
  for (int i = 0; i < nb; ++i) {
8217
9218
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8218
- const uint16_t * restrict q2 = x[i].qs;
8219
- const int8_t * restrict q8 = y[i].qs;
9219
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9220
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8220
9221
  __m256i sumi1 = _mm256_setzero_si256();
8221
9222
  __m256i sumi2 = _mm256_setzero_si256();
8222
9223
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -8256,8 +9257,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8256
9257
  __m256 accumf = _mm256_setzero_ps();
8257
9258
  for (int i = 0; i < nb; ++i) {
8258
9259
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8259
- const uint16_t * restrict q2 = x[i].qs;
8260
- const int8_t * restrict q8 = y[i].qs;
9260
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9261
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8261
9262
  __m128i sumi1_0 = _mm_setzero_si128();
8262
9263
  __m128i sumi1_1 = _mm_setzero_si128();
8263
9264
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -8321,8 +9322,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8321
9322
  vector signed int vsumi2 = v0;
8322
9323
  vector signed int vsumi3 = v0;
8323
9324
 
8324
- const uint16_t * restrict q2 = x[i].qs;
8325
- const int8_t * restrict q8 = y[i].qs;
9325
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9326
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8326
9327
 
8327
9328
  for (int j = 0; j < QK_K/32; j += 2) {
8328
9329
  __builtin_prefetch(q2, 0, 1);
@@ -8398,8 +9399,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8398
9399
  __m256 accumf = (__m256)__lasx_xvldi(0);
8399
9400
  for (int i = 0; i < nb; ++i) {
8400
9401
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8401
- const uint16_t * restrict q2 = x[i].qs;
8402
- const int8_t * restrict q8 = y[i].qs;
9402
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9403
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8403
9404
  __m256i sumi1 = __lasx_xvldi(0);
8404
9405
  __m256i sumi2 = __lasx_xvldi(0);
8405
9406
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -8429,7 +9430,57 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8429
9430
  }
8430
9431
 
8431
9432
  *s = 0.125f * hsum_float_8(accumf);
8432
-
9433
+ //#elif defined(__VXE__) || defined(__VXE2__)
9434
+ // const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9435
+ //
9436
+ // uint32_t aux32[4];
9437
+ // const uint8_t * aux8 = (const uint8_t *)aux32;
9438
+ //
9439
+ // float sumf = 0;
9440
+ //
9441
+ // for (int i = 0; i < nb; ++i) {
9442
+ // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9443
+ // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9444
+ // const int8_t * GGML_RESTRICT q8 = y[i].qs;
9445
+ //
9446
+ // float sumf1 = 0, sumf2 = 0;
9447
+ //
9448
+ // for (int ib32 = 0; ib32 < QK_K/32; ib += 2) {
9449
+ // int8x16_t q8b0 = vec_xl( 0, q8);
9450
+ // int8x16_t qb81 = vec_xl(16, q8);
9451
+ // int8x16_t q8b2 = vec_xl(32, q8);
9452
+ // int8x16_t q8b3 = vec_xl(48, q8);
9453
+ // q8 += 64;
9454
+ //
9455
+ // memcpy(aux32, q2, 4 * sizeof(uint32_t));
9456
+ // q2 += 8;
9457
+ //
9458
+ // int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) };
9459
+ // int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) };
9460
+ // int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) };
9461
+ // int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) };
9462
+ //
9463
+ // int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) };
9464
+ // int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) };
9465
+ // int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) };
9466
+ // int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) };
9467
+ //
9468
+ // q2u0 = vec_mul(q2u0, q2s0);
9469
+ // q2u1 = vec_mul(q2u1, q2s1);
9470
+ // q2u2 = vec_mul(q2u2, q2s2);
9471
+ // q2u3 = vec_mul(q2u3, q2s3);
9472
+ //
9473
+ // const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1);
9474
+ // const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3);
9475
+ //
9476
+ // sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28));
9477
+ // sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28));
9478
+ // }
9479
+ //
9480
+ // sumf += d * (sumf1 + sumf2);
9481
+ // }
9482
+ //
9483
+ // *s = 0.25f * sumf;
8433
9484
  #else
8434
9485
 
8435
9486
  uint32_t aux32[2];
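The __VXE__ branch for iq2_xxs above is shipped commented out. As committed it would not compile if enabled: the inner loop increments ib rather than its own counter ib32, and one activation load is declared qb81 while q8b1 is used further down. A corrected sketch of just those two lines (assumption: the rest of the block stays as-is):

    for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {   /* advance ib32, not ib */
        int8x16_t q8b1 = vec_xl(16, q8);              /* q8b1, matching its later use */
        /* ... remainder of the commented-out branch unchanged ... */
    }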
@@ -8438,8 +9489,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8438
9489
  float sumf = 0.f;
8439
9490
  for (int i = 0; i < nb; ++i) {
8440
9491
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8441
- const uint16_t * restrict q2 = x[i].qs;
8442
- const int8_t * restrict q8 = y[i].qs;
9492
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9493
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8443
9494
  int32_t bsum = 0;
8444
9495
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
8445
9496
  memcpy(aux32, q2, 2*sizeof(uint32_t));
@@ -8462,7 +9513,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8462
9513
  #endif
8463
9514
  }
8464
9515
 
8465
- void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9516
+ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
8466
9517
  assert(n % QK_K == 0);
8467
9518
  assert(nrc == 1);
8468
9519
  UNUSED(nrc);
@@ -8470,8 +9521,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8470
9521
  UNUSED(by);
8471
9522
  UNUSED(bs);
8472
9523
 
8473
- const block_iq2_xs * restrict x = vx;
8474
- const block_q8_K * restrict y = vy;
9524
+ const block_iq2_xs * GGML_RESTRICT x = vx;
9525
+ const block_q8_K * GGML_RESTRICT y = vy;
8475
9526
 
8476
9527
  const int nb = n / QK_K;
8477
9528
 
@@ -8488,8 +9539,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8488
9539
  float sumf = 0;
8489
9540
  for (int i = 0; i < nb; ++i) {
8490
9541
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8491
- const uint16_t * restrict q2 = x[i].qs;
8492
- const int8_t * restrict q8 = y[i].qs;
9542
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9543
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8493
9544
  const uint8x8_t scales8 = vld1_u8(x[i].scales);
8494
9545
  const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
8495
9546
  const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
@@ -8566,8 +9617,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8566
9617
  __m256 accumf = _mm256_setzero_ps();
8567
9618
  for (int i = 0; i < nb; ++i) {
8568
9619
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8569
- const uint16_t * restrict q2 = x[i].qs;
8570
- const int8_t * restrict q8 = y[i].qs;
9620
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9621
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8571
9622
 
8572
9623
  memcpy(&aux64, x[i].scales, 8);
8573
9624
  __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8687,8 +9738,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8687
9738
  __m256 accumf = _mm256_setzero_ps();
8688
9739
  for (int i = 0; i < nb; ++i) {
8689
9740
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8690
- const uint16_t * restrict q2 = x[i].qs;
8691
- const int8_t * restrict q8 = y[i].qs;
9741
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9742
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8692
9743
 
8693
9744
  memcpy(&aux64, x[i].scales, 8);
8694
9745
  __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8842,8 +9893,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8842
9893
  __m256 accumf = (__m256)__lasx_xvldi(0);
8843
9894
  for (int i = 0; i < nb; ++i) {
8844
9895
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8845
- const uint16_t * restrict q2 = x[i].qs;
8846
- const int8_t * restrict q8 = y[i].qs;
9896
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9897
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8847
9898
 
8848
9899
  memcpy(&aux64, x[i].scales, 8);
8849
9900
  __m128i stmp = __lsx_vreplgr2vr_d(aux64);
@@ -8940,9 +9991,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8940
9991
  vector signed int vsumi2 = v0;
8941
9992
  vector signed int vsumi3 = v0;
8942
9993
 
8943
- const uint16_t * restrict q2 = x[i].qs;
8944
- const uint8_t * restrict sc = x[i].scales;
8945
- const int8_t * restrict q8 = y[i].qs;
9994
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9995
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
9996
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8946
9997
 
8947
9998
  for (int j = 0; j < QK_K/64; ++j) {
8948
9999
  __builtin_prefetch(q2, 0, 1);
@@ -9012,9 +10063,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9012
10063
  float sumf = 0.f;
9013
10064
  for (int i = 0; i < nb; ++i) {
9014
10065
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9015
- const uint16_t * restrict q2 = x[i].qs;
9016
- const uint8_t * restrict sc = x[i].scales;
9017
- const int8_t * restrict q8 = y[i].qs;
10066
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
10067
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
10068
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9018
10069
  int32_t bsum = 0;
9019
10070
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9020
10071
  const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
@@ -9047,7 +10098,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9047
10098
  #endif
9048
10099
  }
9049
10100
 
9050
- void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10101
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9051
10102
  assert(n % QK_K == 0);
9052
10103
  assert(nrc == 1);
9053
10104
  UNUSED(nrc);
@@ -9055,8 +10106,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9055
10106
  UNUSED(by);
9056
10107
  UNUSED(bs);
9057
10108
 
9058
- const block_iq2_s * restrict x = vx;
9059
- const block_q8_K * restrict y = vy;
10109
+ const block_iq2_s * GGML_RESTRICT x = vx;
10110
+ const block_q8_K * GGML_RESTRICT y = vy;
9060
10111
 
9061
10112
  const int nb = n / QK_K;
9062
10113
 
@@ -9082,10 +10133,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9082
10133
 
9083
10134
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9084
10135
 
9085
- const uint8_t * restrict qs = x[i].qs;
9086
- const uint8_t * restrict qh = x[i].qh;
9087
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9088
- const int8_t * restrict q8 = y[i].qs;
10136
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10137
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10138
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10139
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9089
10140
 
9090
10141
  int sumi1 = 0, sumi2 = 0;
9091
10142
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9156,10 +10207,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9156
10207
  __m256 accumf = _mm256_setzero_ps();
9157
10208
  for (int i = 0; i < nb; ++i) {
9158
10209
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9159
- const uint8_t * restrict qs = x[i].qs;
9160
- const uint8_t * restrict qh = x[i].qh;
9161
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9162
- const int8_t * restrict q8 = y[i].qs;
10210
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10211
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10212
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10213
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9163
10214
 
9164
10215
  memcpy(&aux64, x[i].scales, 8);
9165
10216
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -9229,10 +10280,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9229
10280
  __m256 accumf = _mm256_setzero_ps();
9230
10281
  for (int i = 0; i < nb; ++i) {
9231
10282
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9232
- const uint8_t * restrict qs = x[i].qs;
9233
- const uint8_t * restrict qh = x[i].qh;
9234
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9235
- const int8_t * restrict q8 = y[i].qs;
10283
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10284
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10285
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10286
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9236
10287
 
9237
10288
  memcpy(&aux64, x[i].scales, 8);
9238
10289
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -9327,11 +10378,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9327
10378
  vector signed int vsumi2 = v0;
9328
10379
  vector signed int vsumi3 = v0;
9329
10380
 
9330
- const uint8_t * restrict q2 = x[i].qs;
9331
- const uint8_t * restrict qh = x[i].qh;
9332
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9333
- const uint8_t * restrict sc = x[i].scales;
9334
- const int8_t * restrict q8 = y[i].qs;
10381
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
10382
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10383
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10384
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
10385
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9335
10386
 
9336
10387
  for (int j = 0; j < QK_K/32; j += 2) {
9337
10388
  __builtin_prefetch(q2, 0, 1);
@@ -9428,10 +10479,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9428
10479
  __m256 accumf = (__m256)__lasx_xvldi(0);
9429
10480
  for (int i = 0; i < nb; ++i) {
9430
10481
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9431
- const uint8_t * restrict qs = x[i].qs;
9432
- const uint8_t * restrict qh = x[i].qh;
9433
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9434
- const int8_t * restrict q8 = y[i].qs;
10482
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10483
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10484
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10485
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9435
10486
 
9436
10487
  __m128i tmp1;
9437
10488
  memcpy(&aux64, x[i].scales, 8);
@@ -9525,7 +10576,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9525
10576
 
9526
10577
  }
9527
10578
 
9528
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10579
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9529
10580
  assert(n % QK_K == 0);
9530
10581
  assert(nrc == 1);
9531
10582
  UNUSED(nrc);
@@ -9533,8 +10584,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9533
10584
  UNUSED(by);
9534
10585
  UNUSED(bs);
9535
10586
 
9536
- const block_iq3_xxs * restrict x = vx;
9537
- const block_q8_K * restrict y = vy;
10587
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
10588
+ const block_q8_K * GGML_RESTRICT y = vy;
9538
10589
 
9539
10590
  const int nb = n / QK_K;
9540
10591
 
@@ -9550,9 +10601,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9550
10601
  float sumf = 0;
9551
10602
  for (int i = 0; i < nb; ++i) {
9552
10603
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9553
- const uint8_t * restrict q3 = x[i].qs;
9554
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9555
- const int8_t * restrict q8 = y[i].qs;
10604
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10605
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10606
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9556
10607
  float sumf1 = 0, sumf2 = 0;
9557
10608
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9558
10609
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -9588,9 +10639,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9588
10639
  __m256 accumf = _mm256_setzero_ps();
9589
10640
  for (int i = 0; i < nb; ++i) {
9590
10641
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9591
- const uint8_t * restrict q3 = x[i].qs;
9592
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9593
- const int8_t * restrict q8 = y[i].qs;
10642
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10643
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10644
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9594
10645
  __m256i sumi1 = _mm256_setzero_si256();
9595
10646
  __m256i sumi2 = _mm256_setzero_si256();
9596
10647
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9633,9 +10684,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9633
10684
  __m256 accumf = _mm256_setzero_ps();
9634
10685
  for (int i = 0; i < nb; ++i) {
9635
10686
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9636
- const uint8_t * restrict q3 = x[i].qs;
9637
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9638
- const int8_t * restrict q8 = y[i].qs;
10687
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10688
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10689
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9639
10690
  __m128i sumi1_0 = _mm_setzero_si128();
9640
10691
  __m128i sumi1_1 = _mm_setzero_si128();
9641
10692
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -9702,9 +10753,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9702
10753
  vector signed int vsumi2 = v0;
9703
10754
  vector signed int vsumi3 = v0;
9704
10755
 
9705
- const uint8_t * restrict q3 = x[i].qs;
9706
- const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
9707
- const int8_t * restrict q8 = y[i].qs;
10756
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10757
+ const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4);
10758
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9708
10759
 
9709
10760
  #pragma GCC unroll 1
9710
10761
  for (int j = 0; j < QK_K/32; j += 2) {
@@ -9776,9 +10827,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9776
10827
  __m256 accumf = (__m256)__lasx_xvldi(0);
9777
10828
  for (int i = 0; i < nb; ++i) {
9778
10829
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9779
- const uint8_t * restrict q3 = x[i].qs;
9780
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9781
- const int8_t * restrict q8 = y[i].qs;
10830
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10831
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10832
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9782
10833
  __m256i sumi1 = __lasx_xvldi(0);
9783
10834
  __m256i sumi2 = __lasx_xvldi(0);
9784
10835
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9821,9 +10872,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9821
10872
  float sumf = 0.f;
9822
10873
  for (int i = 0; i < nb; ++i) {
9823
10874
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9824
- const uint8_t * restrict q3 = x[i].qs;
9825
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9826
- const int8_t * restrict q8 = y[i].qs;
10875
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10876
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10877
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9827
10878
  int32_t bsum = 0;
9828
10879
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9829
10880
  memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
@@ -9848,7 +10899,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9848
10899
  #endif
9849
10900
  }
9850
10901
 
9851
- void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10902
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9852
10903
  assert(n % QK_K == 0);
9853
10904
  assert(nrc == 1);
9854
10905
  UNUSED(nrc);
@@ -9856,8 +10907,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9856
10907
  UNUSED(by);
9857
10908
  UNUSED(bs);
9858
10909
 
9859
- const block_iq3_s * restrict x = vx;
9860
- const block_q8_K * restrict y = vy;
10910
+ const block_iq3_s * GGML_RESTRICT x = vx;
10911
+ const block_q8_K * GGML_RESTRICT y = vy;
9861
10912
 
9862
10913
  const int nb = n / QK_K;
9863
10914
 
@@ -9894,10 +10945,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9894
10945
  float sumf = 0;
9895
10946
  for (int i = 0; i < nb; ++i) {
9896
10947
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9897
- const uint8_t * restrict qs = x[i].qs;
9898
- const uint8_t * restrict qh = x[i].qh;
9899
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
9900
- const int8_t * restrict q8 = y[i].qs;
10948
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10949
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10950
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10951
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9901
10952
 
9902
10953
  memcpy(scales32, x[i].scales, 4);
9903
10954
  scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
@@ -9976,10 +11027,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9976
11027
  __m256 accumf = _mm256_setzero_ps();
9977
11028
  for (int i = 0; i < nb; ++i) {
9978
11029
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9979
- const uint8_t * restrict qs = x[i].qs;
9980
- const uint8_t * restrict qh = x[i].qh;
9981
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
9982
- const int8_t * restrict q8 = y[i].qs;
11030
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11031
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11032
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
11033
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9983
11034
  __m256i sumi1 = _mm256_setzero_si256();
9984
11035
  __m256i sumi2 = _mm256_setzero_si256();
9985
11036
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10061,10 +11112,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10061
11112
  __m256 accumf = _mm256_setzero_ps();
10062
11113
  for (int i = 0; i < nb; ++i) {
10063
11114
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10064
- const uint8_t * restrict qs = x[i].qs;
10065
- const uint8_t * restrict qh = x[i].qh;
10066
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10067
- const int8_t * restrict q8 = y[i].qs;
11115
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11116
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11117
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
11118
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10068
11119
  __m128i sumi1_0 = _mm_setzero_si128();
10069
11120
  __m128i sumi1_1 = _mm_setzero_si128();
10070
11121
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -10162,11 +11213,11 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10162
11213
  vector float vyd = vec_splats(y[i].d);
10163
11214
  vector float vd = vec_mul(vxd, vyd);
10164
11215
 
10165
- const uint8_t * restrict q3 = x[i].qs;
10166
- const uint8_t * restrict qh = x[i].qh;
10167
- const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
10168
- const uint8_t * restrict sc = x[i].scales;
10169
- const int8_t * restrict q8 = y[i].qs;
11216
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
11217
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11218
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs);
11219
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
11220
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10170
11221
 
10171
11222
  vector signed int vsumi0 = v0;
10172
11223
  vector signed int vsumi1 = v0;
@@ -10273,10 +11324,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10273
11324
  __m256 accumf = (__m256)__lasx_xvldi(0);
10274
11325
  for (int i = 0; i < nb; ++i) {
10275
11326
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10276
- const uint8_t * restrict qs = x[i].qs;
10277
- const uint8_t * restrict qh = x[i].qh;
10278
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10279
- const int8_t * restrict q8 = y[i].qs;
11327
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11328
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11329
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
11330
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10280
11331
  __m256i sumi1 = __lasx_xvldi(0);
10281
11332
  __m256i sumi2 = __lasx_xvldi(0);
10282
11333
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10334,10 +11385,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10334
11385
  float sumf = 0.f;
10335
11386
  for (int i = 0; i < nb; ++i) {
10336
11387
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10337
- const uint8_t * restrict qs = x[i].qs;
10338
- const uint8_t * restrict qh = x[i].qh;
10339
- const uint8_t * restrict signs = x[i].signs;
10340
- const int8_t * restrict q8 = y[i].qs;
11388
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11389
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11390
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
11391
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10341
11392
  int32_t bsum = 0;
10342
11393
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10343
11394
  const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
@@ -10389,7 +11440,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
10389
11440
  }
10390
11441
  #endif
10391
11442
 
10392
- void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11443
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10393
11444
  assert(n % QK_K == 0);
10394
11445
  assert(nrc == 1);
10395
11446
  UNUSED(nrc);
@@ -10397,8 +11448,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10397
11448
  UNUSED(by);
10398
11449
  UNUSED(bs);
10399
11450
 
10400
- const block_iq1_s * restrict x = vx;
10401
- const block_q8_K * restrict y = vy;
11451
+ const block_iq1_s * GGML_RESTRICT x = vx;
11452
+ const block_q8_K * GGML_RESTRICT y = vy;
10402
11453
 
10403
11454
  const int nb = n / QK_K;
10404
11455
 
@@ -10460,10 +11511,19 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10460
11511
  __m256i sumi = _mm256_setzero_si256();
10461
11512
  int sumi1 = 0;
10462
11513
  for (int ib = 0; ib < QK_K/32; ib += 2) {
11514
+ #ifdef __BMI2__
11515
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
11516
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
11517
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
11518
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
11519
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
11520
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
11521
+ #else
10463
11522
  const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
10464
11523
  iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
10465
11524
  const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
10466
11525
  iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
11526
+ #endif
10467
11527
  qs += 8;
10468
11528
  const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10469
11529
  const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
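In the __BMI2__ branch above, the four 11-bit grid indices of each 32-value group are assembled with two _pdep_u64 deposits instead of the per-byte shifts of the #else path. A sketch of what each 16-bit lane of packed_idx1 ends up holding (packed_idx2 is the same with qs + 4 and qh[ib + 1]):

    /* lane k = 0..3:
     *   idx1[k] = qs[k]                               // low 8 bits, via mask 0x00ff00ff00ff00ffULL
     *           | (((qh[ib] >> (3*k)) & 0x7) << 8);   // 3 high bits, via mask 0x700070007000700ULL
     * which reproduces the indices formed by the #else path above
     * (qs[0] | ((qh[ib] << 8) & 0x700), then << 5, << 2, >> 1 for the next lanes).
     */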
@@ -10556,10 +11616,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10556
11616
  vector signed int vsumi3 = vec_splats((int32_t)0);
10557
11617
  vector signed int vsumi8 = vec_splats((int32_t)0);
10558
11618
 
10559
- const uint8_t * restrict q1 = x[i].qs;
10560
- const uint16_t * restrict qh = x[i].qh;
10561
- const int8_t * restrict q8 = y[i].qs;
10562
- const int16_t * restrict qs = y[i].bsums;
11619
+ const uint8_t * GGML_RESTRICT q1 = x[i].qs;
11620
+ const uint16_t * GGML_RESTRICT qh = x[i].qh;
11621
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
11622
+ const int16_t * GGML_RESTRICT qs = y[i].bsums;
10563
11623
 
10564
11624
  for (int j = 0; j < QK_K/32; j += 2) {
10565
11625
  __builtin_prefetch(q1, 0, 1);
@@ -10720,7 +11780,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10720
11780
  #endif
10721
11781
  }
10722
11782
 
10723
- void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11783
+ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10724
11784
  assert(n % QK_K == 0);
10725
11785
  assert(nrc == 1);
10726
11786
  UNUSED(nrc);
@@ -10728,8 +11788,8 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10728
11788
  UNUSED(by);
10729
11789
  UNUSED(bs);
10730
11790
 
10731
- const block_iq1_m * restrict x = vx;
10732
- const block_q8_K * restrict y = vy;
11791
+ const block_iq1_m * GGML_RESTRICT x = vx;
11792
+ const block_q8_K * GGML_RESTRICT y = vy;
10733
11793
 
10734
11794
  const int nb = n / QK_K;
10735
11795
 
@@ -10809,6 +11869,10 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10809
11869
 
10810
11870
  const __m256i mask = _mm256_set1_epi16(0x7);
10811
11871
  const __m256i mone = _mm256_set1_epi16(1);
11872
+ const __m256i mone8 = _mm256_set1_epi8(1);
11873
+ const __m256i mtwo8 = _mm256_set1_epi8(2);
11874
+ // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half.
11875
+ const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);
10812
11876
 
10813
11877
  __m256 accum1 = _mm256_setzero_ps();
10814
11878
  __m256 accum2 = _mm256_setzero_ps();
@@ -10820,10 +11884,33 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10820
11884
  const uint16_t * sc = (const uint16_t *)x[i].scales;
10821
11885
 
10822
11886
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
11887
+ // Extract 3-bit scales (16 values)
11888
+ __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc);
11889
+ scales = _mm256_srlv_epi64(scales, scales_shift);
11890
+ scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);
11891
+
11892
+ // Indices to repeat each scale 8 times.
11893
+ __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
11894
+ __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));
10823
11895
 
10824
11896
  __m256i sumi1 = _mm256_setzero_si256();
10825
11897
  __m256i sumi2 = _mm256_setzero_si256();
10826
11898
  for (int ib = 0; ib < QK_K/32; ib += 2) {
11899
+ #ifdef __BMI2__
11900
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
11901
+ | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
11902
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
11903
+ | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
11904
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
11905
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
11906
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
11907
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
11908
+
11909
+ // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
11910
+ const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
11911
+ const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
11912
+ const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
11913
+ #else
10827
11914
  const __m256i q1b_1 = _mm256_set_epi64x(
10828
11915
  iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
10829
11916
  iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
@@ -10832,11 +11919,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10832
11919
  iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
10833
11920
  iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
10834
11921
  );
10835
- const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10836
- const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10837
-
10838
- const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
10839
- const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
10840
11922
 
10841
11923
  const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
10842
11924
  qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
@@ -10846,15 +11928,21 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10846
11928
  qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
10847
11929
  qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
10848
11930
  qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
11931
+ #endif
11932
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
11933
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
11934
+
11935
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
11936
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
11937
+ const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
11938
+ const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
10849
11939
 
10850
- const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
10851
- const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
11940
+ __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
11941
+ __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);
10852
11942
 
10853
- __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
10854
- __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
11943
+ scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
11944
+ scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);
10855
11945
 
10856
- scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
10857
- scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
10858
11946
  const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
10859
11947
  const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
10860
11948
  const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
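Two things change in the iq1_m AVX2 loop above: the sixteen 3-bit sub-block scales are now expanded once per superblock (a variable shift by {0, 6, 3, 9} plus a mask and a byte shuffle) instead of being re-derived from sc[ib/2] on every iteration, and the +/-1 delta contribution is computed through a sign flip rather than a widening multiply. The identity behind dot3/dot4 (sketch only):

    /* delta[j] is +1 or -1 per 8-value group, taken from the sign bits packed in qh:
     *   sum_j delta[j] * q8[j] == sum_j (q8[j] with its sign flipped where delta[j] < 0)
     * _mm256_sign_epi8(q8b, delta) performs the flip, and
     * _mm256_maddubs_epi16(mone8, ...) then adds adjacent bytes into 16-bit lanes.
     */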
@@ -11010,7 +12098,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
11010
12098
  #endif
11011
12099
  }
11012
12100
 
11013
- void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
12101
+ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
11014
12102
  assert(nrc == 1);
11015
12103
  UNUSED(nrc);
11016
12104
  UNUSED(bx);
@@ -11019,8 +12107,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11019
12107
  assert(n % QK4_NL == 0);
11020
12108
  static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
11021
12109
 
11022
- const block_iq4_nl * restrict x = vx;
11023
- const block_q8_0 * restrict y = vy;
12110
+ const block_iq4_nl * GGML_RESTRICT x = vx;
12111
+ const block_q8_0 * GGML_RESTRICT y = vy;
11024
12112
 
11025
12113
  const int nb = n / QK4_NL;
11026
12114
 
@@ -11190,6 +12278,27 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11190
12278
 
11191
12279
  sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
11192
12280
 
12281
+ #elif defined(__VXE__) || defined(__VXE2__)
12282
+ const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
12283
+ const uint8x16_t v_m = vec_splat_u8(0x0F);
12284
+
12285
+ for (; ib < nb; ++ib) {
12286
+ const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
12287
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
12288
+
12289
+ const uint8x16_t v_x = vec_xl(0, x0->qs);
12290
+ int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
12291
+ int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
12292
+
12293
+ v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
12294
+ v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
12295
+
12296
+ const int8x16_t v_yl = vec_xl(0 , y0->qs);
12297
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
12298
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
12299
+
12300
+ sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
12301
+ }
11193
12302
  #endif
11194
12303
  for (; ib < nb; ++ib) {
11195
12304
  const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
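In the __VXE__ iq4_nl branch above, vec_perm(v_k, v_k, idx) acts as a 16-entry table lookup: every 4-bit code selects one byte of kvalues_iq4nl. The scalar view of one block (essentially what the remainder loop that follows computes) is:

    /* low nibbles pair with the first QK4_NL/2 activations, high nibbles with the rest:
     *   sumi += q8[j]            * kvalues_iq4nl[qs[j] & 0xF];
     *   sumi += q8[j + QK4_NL/2] * kvalues_iq4nl[qs[j] >> 4];
     *   sumf += GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) * sumi;
     */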
@@ -11203,7 +12312,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11203
12312
  *s = sumf;
11204
12313
  }
11205
12314
 
11206
- void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
12315
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
11207
12316
  assert(nrc == 1);
11208
12317
  UNUSED(nrc);
11209
12318
  UNUSED(bx);
@@ -11211,8 +12320,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11211
12320
  UNUSED(bs);
11212
12321
  assert(n % QK_K == 0);
11213
12322
 
11214
- const block_iq4_xs * restrict x = vx;
11215
- const block_q8_K * restrict y = vy;
12323
+ const block_iq4_xs * GGML_RESTRICT x = vx;
12324
+ const block_q8_K * GGML_RESTRICT y = vy;
11216
12325
 
11217
12326
  const int nb = n / QK_K;
11218
12327
 
@@ -11369,9 +12478,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11369
12478
 
11370
12479
  uint16_t h = x[ibl].scales_h;
11371
12480
 
11372
- const uint8_t * restrict q4 = x[ibl].qs;
11373
- const uint8_t * restrict sc = x[ibl].scales_l;
11374
- const int8_t * restrict q8 = y[ibl].qs;
12481
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
12482
+ const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l;
12483
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
11375
12484
 
11376
12485
  for (int ib = 0; ib < QK_K/64; ib ++ ) {
11377
12486
  __builtin_prefetch(q4, 0, 1);
@@ -11468,6 +12577,56 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11468
12577
  }
11469
12578
 
11470
12579
  *s = hsum_float_8(accum);
12580
+ #elif defined(__VXE__) || defined(__VXE2__)
12581
+ const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
12582
+ const uint8x16_t v_m = vec_splat_u8(0x0F);
12583
+
12584
+ float sumf = 0;
12585
+
12586
+ for (int ibl = 0; ibl < nb; ++ibl) {
12587
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
12588
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
12589
+
12590
+ uint16_t h = x[ibl].scales_h;
12591
+
12592
+ int sumi1 = 0, sumi2 = 0;
12593
+ for (int ib = 0; ib < QK_K/64; ++ib) {
12594
+ const uint8x16_t v_x0 = vec_xl(0 , q4);
12595
+ const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4);
12596
+ q4 += 32;
12597
+
12598
+ int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
12599
+ int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
12600
+ int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
12601
+ int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
12602
+
12603
+ v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
12604
+ v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
12605
+ v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
12606
+ v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
12607
+
12608
+ const int8x16_t v_y0 = vec_xl( 0, q8);
12609
+ const int8x16_t v_y1 = vec_xl(16, q8);
12610
+ const int8x16_t v_y2 = vec_xl(32, q8);
12611
+ const int8x16_t v_y3 = vec_xl(48, q8);
12612
+ q8 += 64;
12613
+
12614
+ int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1);
12615
+ int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);
12616
+
12617
+ int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
12618
+ int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
12619
+
12620
+ h >>= 4;
12621
+
12622
+ sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
12623
+ sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
12624
+ }
12625
+
12626
+ sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
12627
+ }
12628
+
12629
+ *s = sumf;
11471
12630
 
11472
12631
  #else
11473
12632
  float sumf = 0;
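The __VXE__ iq4_xs branch above reuses the same kvalues_iq4nl lookup as iq4_nl, but instead of one fp16 scale per 32 values it carries a single fp16 superblock scale plus a 6-bit signed scale per 32-value sub-block, split across scales_l (low 4 bits) and scales_h (2 bits) and biased by -32; this is the ls1/ls2 decode visible above. Roughly, per superblock:

    /* *s += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d
     *          * sum over sub-blocks of ls_sub * dot(q4_sub, q8_sub);   // ls_sub in -32..31
     */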
@@ -11506,12 +12665,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11506
12665
 
11507
12666
  // ============================ 4-bit non-linear quants
11508
12667
 
11509
- void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) {
12668
+ void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
11510
12669
  assert(k % QK4_NL == 0);
11511
12670
  quantize_row_iq4_nl_ref(x, y, k);
11512
12671
  }
11513
12672
 
11514
- void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) {
12673
+ void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
11515
12674
  assert(k % QK_K == 0);
11516
12675
  quantize_iq4_xs(x, y, 1, k, NULL);
11517
12676
  }