@fugood/llama.node 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -719,28 +719,28 @@ static inline __m128i packNibbles( __m256i bytes ) {
  }
  #endif //__loongarch_asx

- void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q4_0_ref(x, y, k);
  }

- void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q4_1_ref(x, y, k);
  }

- void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q5_0_ref(x, y, k);
  }

- void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  quantize_row_q5_1_ref(x, y, k);
  }

- void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(QK8_0 == 32);
  assert(k % QK8_0 == 0);
  const int nb = k / QK8_0;

- block_q8_0 * restrict y = vy;
+ block_q8_0 * GGML_RESTRICT y = vy;

  #if defined(__ARM_NEON)
  for (int i = 0; i < nb; i++) {
@@ -1011,6 +1011,38 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);

  }
+ #elif defined(__VXE__) || defined(__VXE2__)
+ for (int i = 0; i < nb; i++) {
+ __vector float srcv [8];
+ __vector float asrcv[8];
+ __vector float amaxv[8];
+
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+ vec_extract(amaxv[0], 1)),
+ MAX(vec_extract(amaxv[0], 2),
+ vec_extract(amaxv[0], 3)));
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f / d : 0.0f;
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ for (int j = 0; j < 8; j++) {
+ const __vector float v = vec_mul(srcv[j], vec_splats(id));
+ const __vector int32_t vi = vec_signed(v);
+
+ y[i].qs[4*j + 0] = vec_extract(vi, 0);
+ y[i].qs[4*j + 1] = vec_extract(vi, 1);
+ y[i].qs[4*j + 2] = vec_extract(vi, 2);
+ y[i].qs[4*j + 3] = vec_extract(vi, 3);
+ }
+ }
  #else
  GGML_UNUSED(nb);
  // scalar
@@ -1018,11 +1050,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  #endif
  }

- void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK8_1 == 0);
  const int nb = k / QK8_1;

- block_q8_1 * restrict y = vy;
+ block_q8_1 * GGML_RESTRICT y = vy;

  #if defined(__ARM_NEON)
  for (int i = 0; i < nb; i++) {
@@ -1337,6 +1369,44 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
  __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0);
  __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
  }
+ #elif defined(__VXE__) || defined(__VXE2__)
+ for (int i = 0; i < nb; i++) {
+ __vector float srcv [8];
+ __vector float asrcv[8];
+ __vector float amaxv[8];
+
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+ vec_extract(amaxv[0], 1)),
+ MAX(vec_extract(amaxv[0], 2),
+ vec_extract(amaxv[0], 3)));
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f / d : 0.0f;
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ __vector int32_t acc = vec_splats(0);
+
+ for (int j = 0; j < 8; j++) {
+ const __vector float v = vec_mul(srcv[j], vec_splats(id));
+ const __vector int32_t vi = vec_signed(v);
+
+ y[i].qs[4*j + 0] = vec_extract(vi, 0);
+ y[i].qs[4*j + 1] = vec_extract(vi, 1);
+ y[i].qs[4*j + 2] = vec_extract(vi, 2);
+ y[i].qs[4*j + 3] = vec_extract(vi, 3);
+
+ acc = vec_add(acc, vi);
+ }
+
+ y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
+ }
  #else
  GGML_UNUSED(nb);
  // scalar
@@ -1358,8 +1428,8 @@ static inline int nearest_int(float fval) {
  return (i & 0x007fffff) - 0x00400000;
  }

- static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
- const float * restrict qw) {
+ static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
+ const float * GGML_RESTRICT qw) {
  float max = 0;
  float amax = 0;
  for (int i = 0; i < n; ++i) {
@@ -1427,7 +1497,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
  return scale;
  }

- static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
+ static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
  float max = 0;
  float amax = 0;
  for (int i = 0; i < n; ++i) {
@@ -1486,7 +1556,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
  return 1/iscale;
  }

- static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+ static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
  int ntry, float alpha) {
  float min = x[0];
  float max = x[0];
@@ -1529,8 +1599,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
  return scale;
  }

- static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
  float rmin, float rdelta, int nstep, bool use_mad) {
  float min = x[0];
  float max = x[0];
@@ -1610,7 +1680,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
  return scale;
  }

- static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
+ static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
  if (j < 4) {
  *d = q[j] & 63; *m = q[j + 4] & 63;
  } else {
@@ -1621,51 +1691,51 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *

  //========================- 2-bit (de)-quantization

- void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  quantize_row_q2_K_ref(x, vy, k);
  }

  //========================= 3-bit (de)-quantization

- void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  quantize_row_q3_K_ref(x, vy, k);
  }

  // ====================== 4-bit (de)-quantization

- void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_q4_K * restrict y = vy;
+ block_q4_K * GGML_RESTRICT y = vy;
  quantize_row_q4_K_ref(x, y, k);
  }

  // ====================== 5-bit (de)-quantization

- void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_q5_K * restrict y = vy;
+ block_q5_K * GGML_RESTRICT y = vy;
  quantize_row_q5_K_ref(x, y, k);
  }

  // ====================== 6-bit (de)-quantization

- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_q6_K * restrict y = vy;
+ block_q6_K * GGML_RESTRICT y = vy;
  quantize_row_q6_K_ref(x, y, k);
  }

  // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)

- void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_tq1_0 * restrict y = vy;
+ block_tq1_0 * GGML_RESTRICT y = vy;
  quantize_row_tq1_0_ref(x, y, k);
  }

- void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
+ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  assert(k % QK_K == 0);
- block_tq2_0 * restrict y = vy;
+ block_tq2_0 * GGML_RESTRICT y = vy;
  quantize_row_tq2_0_ref(x, y, k);
  }

@@ -1673,11 +1743,11 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1

  //===================================== Q8_K ==============================================

- void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
  #ifdef __wasm_simd128__
  assert(k % QK_K == 0);
  const int64_t nb = k / QK_K;
- block_q8_K * restrict yc = y; // Cast to proper type
+ block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type

  for (int i = 0; i < nb; i++) {
  const float * x_block = x + i * QK_K;
@@ -1839,7 +1909,7 @@ static inline __m128i get_scale_shuffle(int i) {
  }
  #endif

- void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

@@ -1854,23 +1924,23 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);

- const block_q4_0 * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_q4_0 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;

  #if defined(__ARM_FEATURE_MATMUL_INT8)
  if (nrc == 2) {
- const block_q4_0 * restrict vx0 = vx;
- const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
- const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+ const block_q4_0 * GGML_RESTRICT vx0 = vx;
+ const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

  float32x4_t sumv0 = vdupq_n_f32(0.0f);

  for (int i = 0; i < nb; i++) {
- const block_q4_0 * restrict b_x0 = &vx0[i];
- const block_q4_0 * restrict b_x1 = &vx1[i];
- const block_q8_0 * restrict b_y0 = &vy0[i];
- const block_q8_0 * restrict b_y1 = &vy1[i];
+ const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i];
+ const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i];
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];

  const uint8x16_t m4b = vdupq_n_u8(0x0F);
  const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -1947,10 +2017,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t ph4 = svptrue_pat_b32(SV_VL4);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // load x
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -1993,10 +2063,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // load x
  const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
@@ -2034,10 +2104,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t pl16 = svnot_b_z(ph32, ph16);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // load x
  const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
@@ -2074,10 +2144,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  float32x4_t sumv1 = vdupq_n_f32(0.0f);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  const uint8x16_t m4b = vdupq_n_u8(0x0F);
  const int8x16_t s8b = vdupq_n_s8(0x8);
@@ -2119,10 +2189,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const v128_t s8b = wasm_i8x16_splat(0x8);

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // Load and process x0
  v128_t v0_0 = wasm_v128_load(x0->qs);
@@ -2488,6 +2558,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  }

  sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+ #elif defined(__VXE__) || defined(__VXE2__)
+ __vector float acc = vec_splats(0.0f);
+
+ const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
+ const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
+
+ for (; ib < nb; ++ib) {
+ const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
+ const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
+ const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+
+ const __vector int8_t v_xls = vec_sub(v_xl, v_s);
+ const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+
+ const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
+ const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+
+ const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
+ const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
+ const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
+ const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+
+ __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+
+ const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
+ const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+ acc = vec_madd(v_xy, v_d, acc);
+ }
+
+ sumf = acc[0] + acc[1] + acc[2] + acc[3];
  #endif
  for (; ib < nb; ++ib) {
  int sumi0 = 0;
@@ -2508,7 +2609,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
  }

- void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_1;
  const int nb = n / qk;

@@ -2523,24 +2624,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);

- const block_q4_1 * restrict x = vx;
- const block_q8_1 * restrict y = vy;
+ const block_q4_1 * GGML_RESTRICT x = vx;
+ const block_q8_1 * GGML_RESTRICT y = vy;

  #if defined(__ARM_FEATURE_MATMUL_INT8)
  if (nrc == 2) {
- const block_q4_1 * restrict vx0 = vx;
- const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
- const block_q8_1 * restrict vy0 = vy;
- const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
+ const block_q4_1 * GGML_RESTRICT vx0 = vx;
+ const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
+ const block_q8_1 * GGML_RESTRICT vy0 = vy;
+ const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);

  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t summs0 = vdupq_n_f32(0.0f);

  for (int i = 0; i < nb; i++) {
- const block_q4_1 * restrict b_x0 = &vx0[i];
- const block_q4_1 * restrict b_x1 = &vx1[i];
- const block_q8_1 * restrict b_y0 = &vy0[i];
- const block_q8_1 * restrict b_y1 = &vy1[i];
+ const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i];
+ const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i];
+ const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i];
+ const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];

  float32_t summs_t[4] = {
  GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
@@ -2614,10 +2715,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  float summs = 0;

  for (; ib + 1 < nb; ib += 2) {
- const block_q4_1 * restrict x0 = &x[ib + 0];
- const block_q4_1 * restrict x1 = &x[ib + 1];
- const block_q8_1 * restrict y0 = &y[ib + 0];
- const block_q8_1 * restrict y1 = &y[ib + 1];
+ const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];

  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);

@@ -2781,6 +2882,35 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  }

  sumf = hsum_float_8(acc) + summs;
+ #elif defined(__VXE__) || defined(__VXE2__)
+ float summs = 0;
+ float32x4_t acc = vec_splats(0.0f);
+
+ const uint8x16_t v_m = vec_splat_u8(0x0F);
+
+ #pragma GCC unroll 4
+ for (; ib < nb; ++ib) {
+ __builtin_prefetch(x[ib].qs, 0, 1);
+ __builtin_prefetch(y[ib].qs, 0, 1);
+
+ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+ const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+ const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+ const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
+
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
+ const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs);
+
+ const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+ const float32x4_t v_xy = vec_float(v_xy_);
+
+ const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+ acc = vec_madd(v_xy, v_d, acc);
+ }
+
+ sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
  #endif
  for (; ib < nb; ++ib) {
  int sumi0 = 0;
@@ -2801,7 +2931,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
  }

- void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

@@ -2816,8 +2946,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);

- const block_q5_0 * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_q5_0 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;

  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -2830,10 +2960,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  uint64_t tmp1[4];

  for (; ib + 1 < nb; ib += 2) {
- const block_q5_0 * restrict x0 = &x[ib];
- const block_q5_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  const uint8x16_t m4b = vdupq_n_u8(0x0F);

@@ -2894,8 +3024,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r

  // TODO: check if unrolling this is better
  for (; ib < nb; ++ib) {
- const block_q5_0 * restrict x0 = &x[ib];
- const block_q8_0 * restrict y0 = &y[ib];
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];

  const v128_t m4b = wasm_i8x16_splat(0x0F);

@@ -3156,7 +3286,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
  }

- void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_1;
  const int nb = n / qk;

@@ -3171,8 +3301,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);

- const block_q5_1 * restrict x = vx;
- const block_q8_1 * restrict y = vy;
+ const block_q5_1 * GGML_RESTRICT x = vx;
+ const block_q8_1 * GGML_RESTRICT y = vy;

  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -3188,10 +3318,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  uint64_t tmp1[4];

  for (; ib + 1 < nb; ib += 2) {
- const block_q5_1 * restrict x0 = &x[ib];
- const block_q5_1 * restrict x1 = &x[ib + 1];
- const block_q8_1 * restrict y0 = &y[ib];
- const block_q8_1 * restrict y1 = &y[ib + 1];
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+ const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];

  const uint8x16_t m4b = vdupq_n_u8(0x0F);

@@ -3257,8 +3387,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

  // TODO: check if unrolling this is better
  for (; ib < nb; ++ib) {
- const block_q5_1 * restrict x0 = &x[ib];
- const block_q8_1 * restrict y0 = &y[ib];
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];

  summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);

@@ -3530,7 +3660,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
  }

- void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

@@ -3545,24 +3675,24 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  UNUSED(by);
  UNUSED(bs);

- const block_q8_0 * restrict x = vx;
- const block_q8_0 * restrict y = vy;
+ const block_q8_0 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;

  #if defined(__ARM_FEATURE_MATMUL_INT8)
  if (nrc == 2) {
- const block_q8_0 * restrict vx0 = vx;
- const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
- const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+ const block_q8_0 * GGML_RESTRICT vx0 = vx;
+ const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
+ const block_q8_0 * GGML_RESTRICT vy0 = vy;
+ const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

  float32x4_t sumv0 = vdupq_n_f32(0.0f);

  for (int i = 0; i < nb; i++) {
- const block_q8_0 * restrict b_x0 = &vx0[i];
- const block_q8_0 * restrict b_y0 = &vy0[i];
+ const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i];
+ const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i];

- const block_q8_0 * restrict b_x1 = &vx1[i];
- const block_q8_0 * restrict b_y1 = &vy1[i];
+ const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i];
+ const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i];

  const int8x16_t x0_l = vld1q_s8(b_x0->qs);
  const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
@@ -3627,10 +3757,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const svbool_t pl16 = svptrue_pat_b32(SV_VL4);

  for (; ib + 1 < nb; ib += 2) {
- const block_q8_0 * restrict x0 = &x[ib + 0];
- const block_q8_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // load x
  const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
@@ -3658,10 +3788,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  {
  //printf("sve256");
  for (; ib + 1 < nb; ib += 2) {
- const block_q8_0 * restrict x0 = &x[ib + 0];
- const block_q8_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  // load x
  const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
@@ -3694,10 +3824,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  svfloat32_t sumv00 = svdup_n_f32(0.0f);

  for (; ib + 1 < nb; ib += 2) {
- const block_q8_0 * restrict x0 = &x[ib + 0];
- const block_q8_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
  // and add them to make one 64 element vector
@@ -3737,10 +3867,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  float32x4_t sumv1 = vdupq_n_f32(0.0f);

  for (; ib + 1 < nb; ib += 2) {
- const block_q8_0 * restrict x0 = &x[ib + 0];
- const block_q8_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0];
+ const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

  const int8x16_t x0_0 = vld1q_s8(x0->qs);
  const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
@@ -3767,8 +3897,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  v128_t sumv = wasm_f32x4_splat(0.0f);

  for (; ib < nb; ++ib) {
- const block_q8_0 * restrict x0 = &x[ib];
- const block_q8_0 * restrict y0 = &y[ib];
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib];
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];

  const v128_t x0_0 = wasm_v128_load(x0->qs);
  const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
@@ -3915,6 +4045,27 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  }

  sumf = hsum_float_8(acc);
+ #elif defined(__VXE__) || defined(__VXE2__)
+ __vector float acc = vec_splats(0.0f);
+
+ #pragma GCC unroll 8
+ for (; ib < nb; ++ib) {
+ __builtin_prefetch(x[ib].qs, 0, 1);
+ __builtin_prefetch(y[ib].qs, 0, 1);
+
+ const int8x16_t v_xl = vec_xl(0 , x[ib].qs);
+ const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs);
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+
+ const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+ const float32x4_t v_xy = vec_float(v_xy_);
+ const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+ acc = vec_madd(v_xy, v_d, acc);
+ }
+
+ sumf = acc[0] + acc[1] + acc[2] + acc[3];
  #endif
  for (; ib < nb; ++ib) {
  int sumi = 0;
@@ -3929,15 +4080,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
  }

- void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  assert(nrc == 1);
  UNUSED(nrc);
  UNUSED(bx);
  UNUSED(by);
  UNUSED(bs);

- const block_tq1_0 * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_tq1_0 * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;

  const int nb = n / QK_K;

@@ -4252,15 +4403,15 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
  #endif
  }

- void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  assert(nrc == 1);
  UNUSED(nrc);
  UNUSED(bx);
  UNUSED(by);
  UNUSED(bs);

- const block_tq2_0 * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_tq2_0 * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;

  const int nb = n / QK_K;

@@ -4424,19 +4575,264 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
  #endif
  }

- void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  assert(nrc == 1);
  UNUSED(nrc);
  UNUSED(bx);
  UNUSED(by);
  UNUSED(bs);

- const block_q2_K * restrict x = vx;
- const block_q8_K * restrict y = vy;
+ const block_q2_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;

  const int nb = n / QK_K;

- #ifdef __ARM_NEON
+ #ifdef __ARM_FEATURE_SVE
+ const int vector_length = svcntb()*8;
+ const svuint8_t m3s = svdup_n_u8(0x3);
+ const svuint32_t m4s = svdup_n_u32(0xF);
+ const svint32_t vzero_sv = svdup_n_s32(0);
+ svfloat32_t acc_sum = svdup_n_f32(0);
+ svbool_t pred_s32 = svptrue_pat_b32(SV_VL4);
+
+ switch (vector_length) {
+ case 128:
+ for (int i = 0; i < nb; ++i) {
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+ svfloat32_t d_broad = svdup_n_f32((float32_t)d);
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+ svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
+
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
+
+ svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc);
+ const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+ mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+4);
+ const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+ svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums);
+ svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+4);
+
+ const svint32_t s0 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_2, q8sums_sv_2));
+
+ mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+8);
+ const svint32_t mins_sv_3 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+ mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+12);
+ const svint32_t mins_sv_4 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4));
+
+ q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums+8);
+ q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+12);
+
+ svint32_t s1 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_3, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_4, q8sums_sv_2));
+
+ svfloat32_t temp = svcvt_f32_s32_x(svptrue_b32(), svadd_s32_x(svptrue_b32(), s0, s1));
+
+ acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, temp, dmin_broad);
+
+ svint32_t sumi1 = svdup_n_s32(0);
+
+ {
+ const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2);
+ svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s));
+ svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+ const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc), m4s));
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 0));
+
+ const svuint8_t q2bits_3 = svld1_u8(svptrue_b8(), q2+16);
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_3, m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 1));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 2), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 2));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 2), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 3));
+
+
+ const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+4), m4s));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 4), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 0));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 4), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 1));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 6), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 2));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 6), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 3));
+
+ //-------------------------------
+
+ q2 += 32;
+ const svint32_t scales_sv_2 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+8), m4s));
+ const svuint8_t q2bits_2 = svld1_u8(svptrue_b8(), q2);
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_2, m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 0));
+
+ const svuint8_t q2bits_4 = svld1_u8(svptrue_b8(), q2+16);
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_4, m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 1));
+
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 2), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 2));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 2), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 3));
+
+
+ const svint32_t scales_sv_3 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+12), m4s));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 4), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 0));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 4), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 1));
+
+
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 6), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 2));
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 6), m3s));
+ q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
+
+ sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 3));
+ }
+ acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, svcvt_f32_s32_x(svptrue_b32(), sumi1), d_broad);
+ }
+ *s = svaddv_f32(svptrue_b32(), acc_sum);
+ break;
+
+ case 256:
+ case 512:
+ for (int i = 0; i < nb; ++i) {
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+ svfloat32_t d_broad = svdup_n_f32((float32_t)d);
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+ svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
+
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
+
+ const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8;
+ const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s));
+ const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, 4));
+ svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums);
+
+ const svuint32_t mins_and_scales_sve_1 = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc);
+ const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, m4s));
+ const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, 4));
+
+ svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums+8);
+
+ svfloat32_t temp = svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_2, q8sums_sv_2)));
+
+ acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad);
+
+ svint32_t sumi1 = svdup_n_s32(0);
+
+ {
+ const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
+ svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s));
+ svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+ svint32_t scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 0), svdup_lane_s32(scales_sv, 1));
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 2), m3s));
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+ svint32_t scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 2), svdup_lane_s32(scales_sv, 3));
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(svdup_n_s32(0), q2bytes_sv, q8bytes_sv), scale_2);
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 4), m3s));
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+ scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 4), svdup_lane_s32(scales_sv, 5));
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 6), m3s));
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+ scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 6), svdup_lane_s32(scales_sv, 7));
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
+
+ q2 += 32;
+
+ const svuint8_t q2bits_2 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_2, m3s));
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+ scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 0), svdup_lane_s32(scales_sv_1, 1));
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 2), m3s));
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+ scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 2), svdup_lane_s32(scales_sv_1, 3));
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 4), m3s));
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+ scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 4), svdup_lane_s32(scales_sv_1, 5));
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1);
+
+ q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 6), m3s));
+ q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
+
+ scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7));
4823
+ sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
4824
+ }
4825
+ acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad);
4826
+ }
4827
+ *s = svaddv_f32(svptrue_pat_b32(SV_VL8), acc_sum);
4828
+ break;
4829
+
4830
+ default:
4831
+ assert(false && "Unsupported vector length");
4832
+ break;
4833
+ }
4834
+
4835
+ #elif __ARM_NEON
4440
4836
  const uint8x16_t m3 = vdupq_n_u8(0x3);
4441
4837
  const uint8x16_t m4 = vdupq_n_u8(0xF);
4442
4838
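For orientation, the SVE path added above computes, per 256-weight super-block, the same quantity as the scalar q2_K reference: each of the 16 sub-blocks of 16 weights carries a 4-bit scale (low nibble of x[i].scales[j]) and a 4-bit min (high nibble), and the min term is folded through the precomputed q8 block sums. A minimal scalar sketch of that arithmetic follows; the function name and raw-pointer signature are illustrative and not part of this diff.

    #include <stdint.h>

    // Scalar sketch of one q2_K x q8_K super-block (QK_K = 256), assuming the
    // usual block layout: sc[16] packs scale (low nibble) and min (high nibble),
    // q2[64] holds 2-bit weights in four bit-planes per 32-byte half, q8[256]
    // are int8 activations and bsums[16] their per-16 sums.
    static float q2k_super_block_dot(float dall, float dmin,
                                     const uint8_t q2[64], const uint8_t sc[16],
                                     const int8_t q8[256], const int16_t bsums[16]) {
        int summs = 0;
        for (int j = 0; j < 16; ++j) summs += bsums[j] * (sc[j] >> 4);

        int isum = 0, is = 0;
        for (int half = 0; half < 2; ++half) {            // two 128-weight halves
            for (int shift = 0; shift <= 6; shift += 2) { // four 2-bit planes
                for (int part = 0; part < 2; ++part) {    // two 16-weight sub-blocks
                    const int d = sc[is++] & 0xF;
                    int suml = 0;
                    for (int l = 0; l < 16; ++l)
                        suml += q8[16*part + l] * ((q2[32*half + 16*part + l] >> shift) & 3);
                    isum += d * suml;
                }
                q8 += 32;                                 // next 32 activations
            }
        }
        // dall = y.d * fp16_to_fp32(x.d), dmin = y.d * fp16_to_fp32(x.dmin)
        return dall * isum - dmin * summs;
    }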
 
@@ -4451,9 +4847,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4451
4847
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4452
4848
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4453
4849
 
4454
- const uint8_t * restrict q2 = x[i].qs;
4455
- const int8_t * restrict q8 = y[i].qs;
4456
- const uint8_t * restrict sc = x[i].scales;
4850
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4851
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4852
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
4457
4853
 
4458
4854
  const uint8x16_t mins_and_scales = vld1q_u8(sc);
4459
4855
  const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
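Most of the surrounding hunks are a mechanical rename of the C99 restrict qualifier to the GGML_RESTRICT macro, which keeps the no-aliasing hint while remaining compilable as C++ and under MSVC. The macro's definition is not part of this diff; a plausible shape, shown only for context, would be:

    /* Plausible definition of GGML_RESTRICT (assumption, not taken from this diff). */
    #if defined(_MSC_VER)
    #    define GGML_RESTRICT __restrict    /* MSVC spelling */
    #elif defined(__cplusplus)
    #    define GGML_RESTRICT __restrict__  /* restrict is not standard C++ */
    #else
    #    define GGML_RESTRICT restrict      /* plain C99 */
    #endif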
@@ -4516,8 +4912,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4516
4912
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4517
4913
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4518
4914
 
4519
- const uint8_t * restrict q2 = x[i].qs;
4520
- const int8_t * restrict q8 = y[i].qs;
4915
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4916
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4521
4917
 
4522
4918
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
4523
4919
  const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
@@ -4583,8 +4979,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4583
4979
  const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
4584
4980
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
4585
4981
 
4586
- const uint8_t * restrict q2 = x[i].qs;
4587
- const int8_t * restrict q8 = y[i].qs;
4982
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
4983
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4588
4984
 
4589
4985
  // load mins and scales from block_q2_K.scales[QK_K/16]
4590
4986
  const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
@@ -4910,8 +5306,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
4910
5306
  vector signed int vsumi6 = v0;
4911
5307
  vector signed int vsumi7 = v0;
4912
5308
 
4913
- const uint8_t * restrict q2 = x[i].qs;
4914
- const int8_t * restrict q8 = y[i].qs;
5309
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
5310
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4915
5311
 
4916
5312
  for (int j = 0; j < QK_K/128; ++j) {
4917
5313
  __builtin_prefetch(q2, 0, 1);
@@ -5002,8 +5398,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5002
5398
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5003
5399
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
5004
5400
 
5005
- const uint8_t * restrict q2 = x[i].qs;
5006
- const int8_t * restrict q8 = y[i].qs;
5401
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
5402
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5007
5403
 
5008
5404
  const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
5009
5405
  const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf);
@@ -5096,7 +5492,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5096
5492
  #endif
5097
5493
  }
5098
5494
 
5099
- void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
5495
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
5100
5496
  assert(n % QK_K == 0);
5101
5497
  assert(nrc == 1);
5102
5498
  UNUSED(nrc);
@@ -5107,12 +5503,187 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5107
5503
  const uint32_t kmask1 = 0x03030303;
5108
5504
  const uint32_t kmask2 = 0x0f0f0f0f;
5109
5505
 
5110
- const block_q3_K * restrict x = vx;
5111
- const block_q8_K * restrict y = vy;
5506
+ const block_q3_K * GGML_RESTRICT x = vx;
5507
+ const block_q8_K * GGML_RESTRICT y = vy;
5112
5508
 
5113
5509
  const int nb = n / QK_K;
5114
5510
 
5115
- #ifdef __ARM_NEON
5511
+ #if defined(__ARM_FEATURE_SVE)
5512
+
5513
+ uint32_t aux[3];
5514
+ uint32_t utmp[4];
5515
+
5516
+ const int8_t m32 = 32;
5517
+ const int vector_length = svcntb()*8;
5518
+ const svuint8_t m3b_sv = svdup_n_u8(0x3);
5519
+ const svint32_t vzero_sv = svdup_n_s32(0);
5520
+
5521
+ const svuint8_t m0_sv = svdup_n_u8(1);
5522
+ const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1);
5523
+ const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2);
5524
+ const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3);
5525
+
5526
+ float sum = 0;
5527
+
5528
+ for (int i = 0; i < nb; ++i) {
5529
+
5530
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5531
+
5532
+ const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
5533
+ const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
5534
+ const int8_t * GGML_RESTRICT q8_sv = y[i].qs;
5535
+
5536
+ // Set up scales
5537
+ memcpy(aux, x[i].scales, 12);
5538
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
5539
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
5540
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
5541
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
5542
+
5543
+ int8_t * scale = (int8_t *)utmp;
5544
+
5545
+ for (int j = 0; j < 16; ++j) scale[j] -= m32;
5546
+
5547
+ switch (vector_length) {
5548
+ case 128:
5549
+ {
5550
+ svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv);
5551
+ svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16);
5552
+ svuint8_t q3h_sv;
5553
+
5554
+ svint32_t sumi1_1 = svdup_n_s32(0);
5555
+ svint8_t q3bytes_sv;
5556
+
5557
+ for (int j = 0; j < QK_K/128; ++j) {
5558
+
5559
+ const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5560
+ const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5561
+ svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5562
+ svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5563
+
5564
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2);
5565
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5566
+
5567
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
5568
+
5569
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2);
5570
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5571
+
5572
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
5573
+
5574
+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5575
+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5576
+
5577
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1);
5578
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5579
+
5580
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
5581
+
5582
+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1);
5583
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5584
+
5585
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
5586
+
5587
+
5588
+ scale += 4;
5589
+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5590
+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5591
+
5592
+ q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1);
5593
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5594
+
5595
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
5596
+
5597
+ q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2);
5598
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5599
+
5600
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
5601
+
5602
+
5603
+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5604
+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5605
+
5606
+ q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1);
5607
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5608
+
5609
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
5610
+
5611
+ q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1);
5612
+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5613
+
5614
+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
5615
+
5616
+ if (j == 0) {
5617
+ qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4);
5618
+ qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4);
5619
+ }
5620
+
5621
+ scale += 4;
5622
+ }
5623
+
5624
+ sum += d * (svaddv_s32(svptrue_b32(), sumi1_1));
5625
+ } break;
5626
+ case 256:
5627
+ case 512:
5628
+ {
5629
+ svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv);
5630
+ svuint8_t q3h_sv;
5631
+
5632
+ svint32_t sumi1_1 = svdup_n_s32(0);
5633
+ svint8_t q3bytes_sv;
5634
+
5635
+ for (int j = 0; j < QK_K/128; ++j) {
5636
+
5637
+ const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32;
5638
+ svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5639
+ svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5640
+
5641
+ q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2);
5642
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5643
+
5644
+
5645
+ svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5646
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
5647
+
5648
+ q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1);
5649
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5650
+
5651
+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5652
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
5653
+
5654
+ scale += 4;
5655
+ q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5656
+ q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5657
+
5658
+ q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv);
5659
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5660
+
5661
+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5662
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
5663
+
5664
+ q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1);
5665
+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5666
+
5667
+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5668
+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
5669
+
5670
+ if (j == 0) {
5671
+ qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4);
5672
+ }
5673
+
5674
+ scale += 4;
5675
+ }
5676
+
5677
+ sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1));
5678
+ } break;
5679
+ default:
5680
+ assert(false && "Unsupported vector length");
5681
+ break;
5682
+ }
5683
+ }
5684
+ *s = sum;
5685
+
5686
+ #elif __ARM_NEON
5116
5687
 
5117
5688
  uint32_t aux[3];
5118
5689
  uint32_t utmp[4];
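The least obvious step in the new q3_K SVE paths above is the scale unpacking: the 12 bytes of x[i].scales carry 16 six-bit scales, which the aux/utmp shuffle rebuilds as 16 bytes before subtracting the +32 storage bias. A scalar restatement of that exact shuffle, using the same masks and names as the added code and wrapped in an illustrative helper, reads:

    #include <stdint.h>
    #include <string.h>

    // Rebuild the 16 signed 6-bit q3_K scales from their 12-byte packed form.
    static void q3k_unpack_scales(const uint8_t packed[12], int8_t scale[16]) {
        const uint32_t kmask1 = 0x03030303;  // top 2 bits of each scale
        const uint32_t kmask2 = 0x0f0f0f0f;  // low/high nibbles
        uint32_t aux[3], utmp[4];
        memcpy(aux, packed, 12);
        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
        utmp[1] = ((aux[1] >> 0) & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
        utmp[0] = ((aux[0] >> 0) & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
        memcpy(scale, utmp, 16);
        for (int j = 0; j < 16; ++j) scale[j] -= 32;  // remove the storage bias
    }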
@@ -5134,9 +5705,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5134
5705
 
5135
5706
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5136
5707
 
5137
- const uint8_t * restrict q3 = x[i].qs;
5138
- const uint8_t * restrict qh = x[i].hmask;
5139
- const int8_t * restrict q8 = y[i].qs;
5708
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5709
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
5710
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5140
5711
 
5141
5712
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
5142
5713
 
@@ -5220,8 +5791,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5220
5791
 
5221
5792
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5222
5793
 
5223
- const uint8_t * restrict q3 = x[i].qs;
5224
- const int8_t * restrict q8 = y[i].qs;
5794
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5795
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5225
5796
 
5226
5797
  // Set up scales
5227
5798
  memcpy(aux, x[i].scales, 12);
@@ -5325,8 +5896,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5325
5896
 
5326
5897
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5327
5898
 
5328
- const uint8_t * restrict q3 = x[i].qs;
5329
- const int8_t * restrict q8 = y[i].qs;
5899
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
5900
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5330
5901
 
5331
5902
  // Set up scales
5332
5903
  aux = (const uint32_t *)x[i].scales;
@@ -5459,9 +6030,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5459
6030
 
5460
6031
  float sumf = 0;
5461
6032
  for (int i = 0; i < nb; ++i) {
5462
- const uint8_t * restrict q3 = x[i].qs;
5463
- const uint8_t * restrict hm = x[i].hmask;
5464
- const int8_t * restrict q8 = y[i].qs;
6033
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6034
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
6035
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5465
6036
 
5466
6037
  // Process blocks with SIMD
5467
6038
  int8_t * a = aux8;
@@ -5548,9 +6119,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5548
6119
  float sumf = 0;
5549
6120
  for (int i = 0; i < nb; ++i) {
5550
6121
 
5551
- const uint8_t * restrict q3 = x[i].qs;
5552
- const uint8_t * restrict qh = x[i].hmask;
5553
- const int8_t * restrict q8 = y[i].qs;
6122
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6123
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
6124
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5554
6125
 
5555
6126
  memcpy(aux, x[i].scales, 12);
5556
6127
  utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
@@ -5690,8 +6261,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5690
6261
  vector signed int vsumi6 = v0;
5691
6262
  vector signed int vsumi7 = v0;
5692
6263
 
5693
- const uint8_t * restrict q3 = x[i].qs;
5694
- const int8_t * restrict q8 = y[i].qs;
6264
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6265
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5695
6266
 
5696
6267
  for (int j = 0; j < QK_K/128; ++j) {
5697
6268
  __builtin_prefetch(q3, 0, 1);
@@ -5804,8 +6375,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5804
6375
  for (int i = 0; i < nb; ++i) {
5805
6376
 
5806
6377
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
5807
- const uint8_t * restrict q3 = x[i].qs;
5808
- const int8_t * restrict q8 = y[i].qs;
6378
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6379
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5809
6380
  // Set up scales
5810
6381
  memcpy(aux, x[i].scales, 12);
5811
6382
  __m128i scales128 = lsx_set_w(
@@ -5890,11 +6461,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5890
6461
 
5891
6462
  float sumf = 0;
5892
6463
  for (int i = 0; i < nb; ++i) {
5893
- const uint8_t * restrict q3 = x[i].qs;
5894
- const uint8_t * restrict hm = x[i].hmask;
5895
- const int8_t * restrict q8 = y[i].qs;
6464
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
6465
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
6466
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5896
6467
  memset(aux32, 0, 8*sizeof(int32_t));
5897
- int8_t * restrict a = aux8;
6468
+ int8_t * GGML_RESTRICT a = aux8;
5898
6469
  uint8_t m = 1;
5899
6470
  for (int j = 0; j < QK_K; j += 128) {
5900
6471
  for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
@@ -5937,7 +6508,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5937
6508
 
5938
6509
  }
5939
6510
 
5940
- void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
6511
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
5941
6512
  assert(n % QK_K == 0);
5942
6513
  assert(nrc == 1);
5943
6514
  UNUSED(nrc);
@@ -5945,8 +6516,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5945
6516
  UNUSED(by);
5946
6517
  UNUSED(bs);
5947
6518
 
5948
- const block_q4_K * restrict x = vx;
5949
- const block_q8_K * restrict y = vy;
6519
+ const block_q4_K * GGML_RESTRICT x = vx;
6520
+ const block_q8_K * GGML_RESTRICT y = vy;
5950
6521
 
5951
6522
  const int nb = n / QK_K;
5952
6523
 
@@ -5981,8 +6552,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5981
6552
 
5982
6553
  const uint8_t * scales = (const uint8_t *)utmp;
5983
6554
 
5984
- const uint8_t * restrict q4 = x[i].qs;
5985
- const int8_t * restrict q8 = y[i].qs;
6555
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6556
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
5986
6557
 
5987
6558
  const int vector_length = ggml_cpu_get_sve_cnt()*8;
5988
6559
  const svuint8_t m4b = svdup_n_u8(0xf);
@@ -6069,8 +6640,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6069
6640
 
6070
6641
  const uint8_t * scales = (const uint8_t *)utmp;
6071
6642
 
6072
- const uint8_t * restrict q4 = x[i].qs;
6073
- const int8_t * restrict q8 = y[i].qs;
6643
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6644
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6074
6645
 
6075
6646
  int32_t sumi1 = 0;
6076
6647
  int32_t sumi2 = 0;
@@ -6108,8 +6679,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6108
6679
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6109
6680
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
6110
6681
 
6111
- const uint8_t * restrict q4 = x[i].qs;
6112
- const int8_t * restrict q8 = y[i].qs;
6682
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6683
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6113
6684
 
6114
6685
  // Process scales and mins
6115
6686
  memcpy(utmp, x[i].scales, 12);
@@ -6121,7 +6692,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6121
6692
 
6122
6693
  // Sum mins * q8sums
6123
6694
  int32_t sumi = 0;
6124
- const int16_t * restrict q8sums = y[i].bsums;
6695
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
6125
6696
  const uint8_t * m = (const uint8_t *)&utmp[2];
6126
6697
  for (int j = 0; j < 16; j += 2) {
6127
6698
  sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
@@ -6220,8 +6791,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6220
6791
  utmp[2] = uaux;
6221
6792
  utmp[0] &= kmask1;
6222
6793
 
6223
- const uint8_t * restrict q4 = x[i].qs;
6224
- const int8_t * restrict q8 = y[i].qs;
6794
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6795
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6225
6796
 
6226
6797
  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
6227
6798
 
@@ -6279,8 +6850,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6279
6850
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6280
6851
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
6281
6852
 
6282
- const uint8_t * restrict q4 = x[i].qs;
6283
- const int8_t * restrict q8 = y[i].qs;
6853
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6854
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6284
6855
 
6285
6856
  memcpy(utmp, x[i].scales, 12);
6286
6857
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -6380,8 +6951,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6380
6951
  vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
6381
6952
  sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
6382
6953
 
6383
- const uint8_t * restrict q4 = x[i].qs;
6384
- const int8_t * restrict q8 = y[i].qs;
6954
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
6955
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6385
6956
 
6386
6957
  vl = 32;
6387
6958
 
@@ -6482,8 +7053,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6482
7053
  vector signed int vsumi2 = v0;
6483
7054
  vector signed int vsumi3 = v0;
6484
7055
 
6485
- const uint8_t * restrict q4 = x[i].qs;
6486
- const int8_t * restrict q8 = y[i].qs;
7056
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7057
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6487
7058
 
6488
7059
  for (int j = 0; j < QK_K/64; j+=2) {
6489
7060
  __builtin_prefetch(q4, 0, 1);
@@ -6574,8 +7145,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6574
7145
  utmp[2] = uaux;
6575
7146
  utmp[0] &= kmask1;
6576
7147
 
6577
- const uint8_t * restrict q4 = x[i].qs;
6578
- const int8_t * restrict q8 = y[i].qs;
7148
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7149
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6579
7150
 
6580
7151
  const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
6581
7152
  const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
@@ -6622,6 +7193,77 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6622
7193
 
6623
7194
 
6624
7195
  *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
7196
+ #elif defined(__VXE__) || defined(__VXE2__)
7197
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
7198
+ const int32x4_t v_z = vec_splat_s32(0);
7199
+
7200
+ uint8x16_t v_x[2];
7201
+ int8x16_t v_xl[2];
7202
+ int8x16_t v_y[2];
7203
+
7204
+ float sumf = 0;
7205
+
7206
+ for (int i = 0; i < nb; ++i) {
7207
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7208
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
7209
+
7210
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
7211
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
7212
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
7213
+
7214
+ memcpy(utmp, x[i].scales, 12);
7215
+
7216
+ uint32x4_t v_mins8 = { 0 };
7217
+ v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0);
7218
+ v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1);
7219
+
7220
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
7221
+ utmp[0] &= kmask1;
7222
+
7223
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
7224
+
7225
+ const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
7226
+ const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
7227
+ const int32x4_t v_mins = v_minso + v_minse;
7228
+ sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
7229
+
7230
+ const uint8_t * scales = (const uint8_t *)utmp;
7231
+ const uint8_t * GGML_RESTRICT x0 = x[i].qs;
7232
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
7233
+
7234
+ int32_t sumi1 = 0;
7235
+ int32_t sumi2 = 0;
7236
+
7237
+ for (int j = 0; j < QK_K/64; ++j) {
7238
+ v_x[0] = vec_xl(0 , x0);
7239
+ v_x[1] = vec_xl(16, x0);
7240
+ x0 += 32;
7241
+
7242
+ v_y[0] = vec_xl(0 , y0);
7243
+ v_y[1] = vec_xl(16, y0);
7244
+ y0 += 32;
7245
+
7246
+ v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm);
7247
+ v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
7248
+
7249
+ const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
7250
+ sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
7251
+
7252
+ v_y[0] = vec_xl(0 , y0);
7253
+ v_y[1] = vec_xl(16, y0);
7254
+ y0 += 32;
7255
+
7256
+ v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4);
7257
+ v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
7258
+
7259
+ const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
7260
+ sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
7261
+ }
7262
+
7263
+ sumf += d * (sumi1 + sumi2);
7264
+ }
7265
+
7266
+ *s = sumf;
6625
7267
  #else
6626
7268
 
6627
7269
  const uint8_t * scales = (const uint8_t*)&utmp[0];
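The VXE (s390x vector) q4_K path added above follows the same per-super-block recipe as the other SIMD paths: 8 sub-blocks of 32 weights, one 6-bit scale and one 6-bit min each, with the low and high nibbles of every qs byte holding two weights. A scalar sketch of one super-block, assuming the scales and mins have already been unpacked the way the kmask shuffle above does, might look like this:

    #include <stdint.h>

    // Scalar sketch of one q4_K x q8_K super-block (QK_K = 256).
    static float q4k_super_block_dot(float d, float dmin,
                                     const uint8_t q4[128],
                                     const uint8_t scales[8], const uint8_t mins[8],
                                     const int8_t q8[256], const int16_t bsums[16]) {
        int summin = 0;   // min term, folded through the per-16 activation sums
        for (int j = 0; j < 8; ++j) summin += (bsums[2*j] + bsums[2*j + 1]) * mins[j];

        int sumi = 0;
        for (int j = 0; j < 4; ++j) {          // 32 qs bytes -> 64 weights per step
            int s_lo = 0, s_hi = 0;
            for (int l = 0; l < 32; ++l) {
                s_lo += (q4[32*j + l] & 0xF) * q8[64*j + l];        // low nibbles
                s_hi += (q4[32*j + l] >>  4) * q8[64*j + 32 + l];   // high nibbles
            }
            sumi += s_lo * scales[2*j] + s_hi * scales[2*j + 1];
        }
        return d * sumi - dmin * summin;
    }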
@@ -6635,10 +7277,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6635
7277
 
6636
7278
  float sumf = 0;
6637
7279
  for (int i = 0; i < nb; ++i) {
6638
- const uint8_t * restrict q4 = x[i].qs;
6639
- const int8_t * restrict q8 = y[i].qs;
7280
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
7281
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6640
7282
  memset(aux32, 0, 8*sizeof(int32_t));
6641
- int8_t * restrict a = aux8;
7283
+ int8_t * GGML_RESTRICT a = aux8;
6642
7284
  for (int j = 0; j < QK_K/64; ++j) {
6643
7285
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
6644
7286
  a += 32;
@@ -6681,7 +7323,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6681
7323
  #endif
6682
7324
  }
6683
7325
 
6684
- void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
7326
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
6685
7327
  assert(n % QK_K == 0);
6686
7328
  assert(nrc == 1);
6687
7329
  UNUSED(nrc);
@@ -6689,8 +7331,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6689
7331
  UNUSED(by);
6690
7332
  UNUSED(bs);
6691
7333
 
6692
- const block_q5_K * restrict x = vx;
6693
- const block_q8_K * restrict y = vy;
7334
+ const block_q5_K * GGML_RESTRICT x = vx;
7335
+ const block_q8_K * GGML_RESTRICT y = vy;
6694
7336
 
6695
7337
  const int nb = n / QK_K;
6696
7338
 
@@ -6732,9 +7374,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6732
7374
 
6733
7375
  const uint8_t * scales = (const uint8_t *)utmp;
6734
7376
 
6735
- const uint8_t * restrict q5 = x[i].qs;
6736
- const uint8_t * restrict qh = x[i].qh;
6737
- const int8_t * restrict q8 = y[i].qs;
7377
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7378
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7379
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6738
7380
 
6739
7381
  ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
6740
7382
 
@@ -6779,8 +7421,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6779
7421
  float summs = 0.f;
6780
7422
 
6781
7423
  for (int i = 0; i < nb; ++i) {
6782
- const uint8_t * restrict q5 = x[i].qs;
6783
- const int8_t * restrict q8 = y[i].qs;
7424
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7425
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6784
7426
 
6785
7427
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6786
7428
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -6863,8 +7505,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6863
7505
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6864
7506
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
6865
7507
 
6866
- const uint8_t * restrict q5 = x[i].qs;
6867
- const int8_t * restrict q8 = y[i].qs;
7508
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7509
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6868
7510
 
6869
7511
  memcpy(utmp, x[i].scales, 12);
6870
7512
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -6955,9 +7597,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6955
7597
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
6956
7598
  const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
6957
7599
 
6958
- const uint8_t * restrict q5 = x[i].qs;
6959
- const uint8_t * restrict qh = x[i].qh;
6960
- const int8_t * restrict q8 = y[i].qs;
7600
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7601
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7602
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
6961
7603
 
6962
7604
  // Process scales and mins
6963
7605
  memcpy(utmp, x[i].scales, 12);
@@ -6969,7 +7611,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6969
7611
 
6970
7612
  // Sum mins * q8sums
6971
7613
  int32_t sumi_mins = 0;
6972
- const int16_t * restrict q8sums = y[i].bsums;
7614
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
6973
7615
  const uint8_t * m = (const uint8_t *)&utmp[2];
6974
7616
  for (int j = 0; j < 16; j += 2) {
6975
7617
  sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
@@ -7073,9 +7715,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7073
7715
 
7074
7716
  vl = 8;
7075
7717
 
7076
- const uint8_t * restrict q5 = x[i].qs;
7077
- const uint8_t * restrict hm = x[i].qh;
7078
- const int8_t * restrict q8 = y[i].qs;
7718
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7719
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
7720
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7079
7721
 
7080
7722
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7081
7723
  const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
@@ -7214,8 +7856,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7214
7856
  vector signed int vsumi2 = v0;
7215
7857
  vector signed int vsumi3 = v0;
7216
7858
 
7217
- const uint8_t * restrict q5 = x[i].qs;
7218
- const int8_t * restrict q8 = y[i].qs;
7859
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7860
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7219
7861
 
7220
7862
  for (int j = 0; j < QK_K/64; ++j) {
7221
7863
  __builtin_prefetch(q5, 0, 1);
@@ -7287,8 +7929,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7287
7929
 
7288
7930
  for (int i = 0; i < nb; ++i) {
7289
7931
 
7290
- const uint8_t * restrict q5 = x[i].qs;
7291
- const int8_t * restrict q8 = y[i].qs;
7932
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
7933
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7292
7934
 
7293
7935
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7294
7936
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@@ -7351,7 +7993,94 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7351
7993
  acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4));
7352
7994
 
7353
7995
  *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
7996
+ #elif defined(__VXE__) || defined(__VXE2__)
7997
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
7998
+ const uint8x16_t v_1m = vec_splat_u8(0x01);
7999
+ const uint8x16_t v_2m = vec_splat_u8(0x02);
8000
+
8001
+ const int32x4_t v_z = vec_splat_s32(0);
8002
+
8003
+ const uchar8x16_t v_minsm = {
8004
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
8005
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
8006
+ };
8007
+
8008
+ int8x16_t q5b[4];
8009
+ uint8x16_t q5h[4];
8010
+
8011
+ uint8x16_t v_xl[2];
8012
+ uint8x16_t v_xh[2];
8013
+ int8x16_t v_y[4];
8014
+
8015
+ float sumf = 0;
8016
+
8017
+ for (int i = 0; i < nb; ++i) {
8018
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8019
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
8020
+
8021
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
8022
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
8023
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
8024
+
8025
+ memcpy(utmp, x[i].scales, 12);
8026
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
8027
+ const uint32_t uaux = utmp[1] & kmask1;
8028
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
8029
+ utmp[2] = uaux;
8030
+ utmp[0] &= kmask1;
8031
+
8032
+ const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
8033
+ const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
8034
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
8035
+
8036
+ const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
8037
+ const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
8038
+ const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
8039
+ const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
8040
+
8041
+ const uint8_t * scales = (const uint8_t *)utmp;
8042
+ const uint8_t * GGML_RESTRICT x0l = x[i].qs;
8043
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
8044
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
8045
+
8046
+ v_xh[0] = vec_xl(0 , x0h);
8047
+ v_xh[1] = vec_xl(16, x0h);
8048
+
8049
+ int32_t sumi = 0;
8050
+ for (int j = 0; j < QK_K/64; ++j) {
8051
+ v_xl[0] = vec_xl(0 , x0l);
8052
+ v_xl[1] = vec_xl(16, x0l);
8053
+ x0l += 32;
8054
+
8055
+ v_y[0] = vec_xl(0 , y0);
8056
+ v_y[1] = vec_xl(16, y0);
8057
+ v_y[2] = vec_xl(32, y0);
8058
+ v_y[3] = vec_xl(48, y0);
8059
+ y0 += 64;
8060
+
8061
+ q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
8062
+ q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
8063
+ q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
8064
+ q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
8065
+ v_xh[0] = vec_sr(v_xh[0], 2);
8066
+ v_xh[1] = vec_sr(v_xh[1], 2);
7354
8067
 
8068
+ q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
8069
+ q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
8070
+ q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
8071
+ q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);
8072
+
8073
+ int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
8074
+ int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
8075
+
8076
+ sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
8077
+ sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
8078
+ }
8079
+
8080
+ sumf += d * sumi - dmin * mins;
8081
+ }
8082
+
8083
+ *s = sumf;
7355
8084
  #else
7356
8085
 
7357
8086
  const uint8_t * scales = (const uint8_t*)&utmp[0];
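The q5_K VXE loop above reconstructs each 5-bit weight from a 4-bit nibble in qs plus one extra bit from qh before taking the dot products; each sub-block of 32 weights then gets its own 6-bit scale and min, exactly as in q4_K. A scalar sketch of that reconstruction (an illustrative helper; the real kernel works on whole vectors) is:

    #include <stdint.h>

    // Return the unsigned 5-bit q5_K weight at position idx (0..255) of a super-block.
    static inline int q5k_weight(const uint8_t qs[128], const uint8_t qh[32], int idx) {
        const int sub  = idx / 32;                    // 8 sub-blocks of 32 weights
        const int l    = idx % 32;
        const int byte = 32 * (sub / 2) + l;          // two sub-blocks share one qs byte
        const int nib  = (sub & 1) ? (qs[byte] >> 4) : (qs[byte] & 0xF);
        const int hi   = (qh[l] >> sub) & 1;          // one qh bit plane per sub-block
        return nib | (hi << 4);                       // 0..31, later scaled per sub-block
    }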
@@ -7365,11 +8094,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7365
8094
 
7366
8095
  float sumf = 0;
7367
8096
  for (int i = 0; i < nb; ++i) {
7368
- const uint8_t * restrict q4 = x[i].qs;
7369
- const uint8_t * restrict hm = x[i].qh;
7370
- const int8_t * restrict q8 = y[i].qs;
8097
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
8098
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
8099
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7371
8100
  memset(aux32, 0, 8*sizeof(int32_t));
7372
- int8_t * restrict a = aux8;
8101
+ int8_t * GGML_RESTRICT a = aux8;
7373
8102
  uint8_t m = 1;
7374
8103
  for (int j = 0; j < QK_K/64; ++j) {
7375
8104
  for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
@@ -7416,7 +8145,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7416
8145
  #endif
7417
8146
  }
7418
8147
 
7419
- void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
8148
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
7420
8149
  assert(n % QK_K == 0);
7421
8150
  assert(nrc == 1);
7422
8151
  UNUSED(nrc);
@@ -7424,8 +8153,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7424
8153
  UNUSED(by);
7425
8154
  UNUSED(bs);
7426
8155
 
7427
- const block_q6_K * restrict x = vx;
7428
- const block_q8_K * restrict y = vy;
8156
+ const block_q6_K * GGML_RESTRICT x = vx;
8157
+ const block_q8_K * GGML_RESTRICT y = vy;
7429
8158
 
7430
8159
  const int nb = n / QK_K;
7431
8160
 
@@ -7445,11 +8174,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7445
8174
 
7446
8175
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
7447
8176
 
7448
- const uint8_t * restrict q6 = x[i].ql;
7449
- const uint8_t * restrict qh = x[i].qh;
7450
- const int8_t * restrict q8 = y[i].qs;
8177
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8178
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8179
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7451
8180
 
7452
- const int8_t * restrict scale = x[i].scales;
8181
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
7453
8182
 
7454
8183
  const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
7455
8184
  const int8x16_t scales = vld1q_s8(scale);
@@ -7536,9 +8265,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7536
8265
 
7537
8266
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7538
8267
 
7539
- const uint8_t * restrict q4 = x[i].ql;
7540
- const uint8_t * restrict qh = x[i].qh;
7541
- const int8_t * restrict q8 = y[i].qs;
8268
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8269
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8270
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7542
8271
 
7543
8272
  const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
7544
8273
 
@@ -7614,9 +8343,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7614
8343
 
7615
8344
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
7616
8345
 
7617
- const uint8_t * restrict q4 = x[i].ql;
7618
- const uint8_t * restrict qh = x[i].qh;
7619
- const int8_t * restrict q8 = y[i].qs;
8346
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8347
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8348
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7620
8349
 
7621
8350
  // handle the q6_k -32 offset separately using bsums
7622
8351
  const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
@@ -7715,8 +8444,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7715
8444
 
7716
8445
  for (int i = 0; i < nb; ++i) {
7717
8446
  // Unpack 6-bit quantized data into aux8 (unchanged)
7718
- const uint8_t * restrict q4 = x[i].ql;
7719
- const uint8_t * restrict qh = x[i].qh;
8447
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8448
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
7720
8449
  int8_t * a = aux8;
7721
8450
  for (int j = 0; j < QK_K; j += 128) {
7722
8451
  for (int l = 0; l < 32; ++l) {
@@ -7730,8 +8459,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7730
8459
  qh += 32;
7731
8460
  }
7732
8461
 
7733
- const int8_t * restrict a_ptr = aux8;
7734
- const int8_t * restrict q8 = y[i].qs;
8462
+ const int8_t * GGML_RESTRICT a_ptr = aux8;
8463
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7735
8464
  v128_t acc0 = wasm_i32x4_splat(0);
7736
8465
  v128_t acc1 = wasm_i32x4_splat(0);
7737
8466
 
@@ -7794,11 +8523,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7794
8523
 
7795
8524
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7796
8525
 
7797
- const uint8_t * restrict q6 = x[i].ql;
7798
- const uint8_t * restrict qh = x[i].qh;
7799
- const int8_t * restrict q8 = y[i].qs;
8526
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8527
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8528
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7800
8529
 
7801
- const int8_t * restrict scale = x[i].scales;
8530
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
7802
8531
 
7803
8532
  size_t vl;
7804
8533
 
@@ -7900,10 +8629,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7900
8629
  vector signed int vsumi6 = v0;
7901
8630
  vector signed int vsumi7 = v0;
7902
8631
 
7903
- const uint8_t * restrict q6 = x[i].ql;
7904
- const uint8_t * restrict qh = x[i].qh;
7905
- const int8_t * restrict qs = x[i].scales;
7906
- const int8_t * restrict q8 = y[i].qs;
8632
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
8633
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8634
+ const int8_t * GGML_RESTRICT qs = x[i].scales;
8635
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
7907
8636
 
7908
8637
  for (int j = 0; j < QK_K/128; ++j) {
7909
8638
  __builtin_prefetch(q6, 0, 0);
@@ -8019,9 +8748,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8019
8748
 
8020
8749
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
8021
8750
 
8022
- const uint8_t * restrict q4 = x[i].ql;
8023
- const uint8_t * restrict qh = x[i].qh;
8024
- const int8_t * restrict q8 = y[i].qs;
8751
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8752
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8753
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8025
8754
 
8026
8755
  const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
8027
8756
  const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
@@ -8068,7 +8797,130 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8068
8797
  }
8069
8798
 
8070
8799
  *s = hsum_float_8(acc);
8800
+ #elif defined(__VXE__) || defined(__VXE2__)
8801
+ float sum = 0;
8802
+
8803
+ // Lower 4-bit and upper 2-bit masks
8804
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
8805
+ const uint8x16_t v_um = vec_splat_u8(0x03);
8806
+
8807
+ const int32x4_t v_z = vec_splat_s32(0);
8808
+
8809
+ int8x16_t q6b[4];
8810
+ uint8x16_t q6h[4];
8811
+
8812
+ uint8x16_t v_xl[4];
8813
+ uint8x16_t v_xh[2];
8814
+ int8x16_t v_y[4];
8815
+
8816
+ for (int i = 0; i < nb; ++i) {
8817
+ const float d_all = GGML_FP16_TO_FP32(x[i].d);
8818
+
8819
+ const uint8_t * GGML_RESTRICT x0l = x[i].ql;
8820
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
8821
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
8071
8822
 
8823
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
8824
+
8825
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
8826
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
8827
+
8828
+ const int8x16_t v_scale = vec_xl(0, scale);
8829
+ const int16x8_t v_scalel = vec_unpackh(v_scale);
8830
+ const int16x8_t v_scaleh = vec_unpackl(v_scale);
8831
+
8832
+ const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
8833
+ const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
8834
+ const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
8835
+ const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
8836
+ const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
8837
+
8838
+ const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
8839
+
8840
+ int32_t isum = 0;
8841
+ for (int j = 0; j < QK_K/128; ++j) {
8842
+ // Load model upper 2 bits
8843
+ v_xh[0] = vec_xl(0 , x0h);
8844
+ v_xh[1] = vec_xl(16, x0h);
8845
+ x0h += 32;
8846
+
8847
+ // Load model lower 4 bits
8848
+ v_xl[0] = vec_xl(0 , x0l);
8849
+ v_xl[1] = vec_xl(16, x0l);
8850
+ v_xl[2] = vec_xl(32, x0l);
8851
+ v_xl[3] = vec_xl(48, x0l);
8852
+ x0l += 64;
8853
+
8854
+ // Load activation quants
8855
+ v_y[0] = vec_xl(0 , y0);
8856
+ v_y[1] = vec_xl(16, y0);
8857
+ v_y[2] = vec_xl(32, y0);
8858
+ v_y[3] = vec_xl(48, y0);
8859
+ y0 += 64;
8860
+
8861
+ q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4);
8862
+ q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4);
8863
+ uint8x16_t shifted = vec_sr(v_xh[0], 2);
8864
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
8865
+ shifted = vec_sr(v_xh[1], 2);
8866
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
8867
+
8868
+ q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0]));
8869
+ q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1]));
8870
+ q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2]));
8871
+ q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3]));
8872
+
8873
+ int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
8874
+ int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
8875
+ int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
8876
+ int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
8877
+
8878
+ isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
8879
+ (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
8880
+ (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
8881
+ (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
8882
+
8883
+ scale += 4;
8884
+
8885
+
8886
+ // Load activation quants
8887
+ v_y[0] = vec_xl(0 , y0);
8888
+ v_y[1] = vec_xl(16, y0);
8889
+ v_y[2] = vec_xl(32, y0);
8890
+ v_y[3] = vec_xl(48, y0);
8891
+ y0 += 64;
8892
+
8893
+ shifted = vec_sr(v_xh[0], 4);
8894
+ q6h[0] = vec_sl(vec_and(v_um, shifted), 4);
8895
+ shifted = vec_sr(v_xh[1], 4);
8896
+ q6h[1] = vec_sl(vec_and(v_um, shifted), 4);
8897
+ shifted = vec_sr(v_xh[0], 6);
8898
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
8899
+ shifted = vec_sr(v_xh[1], 6);
8900
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
8901
+
8902
+ q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0]));
8903
+ q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1]));
8904
+ q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2]));
8905
+ q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3]));
8906
+
8907
+ summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
8908
+ summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
8909
+ summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
8910
+ summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
8911
+
8912
+ isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
8913
+ (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
8914
+ (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
8915
+ (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
8916
+
8917
+ scale += 4;
8918
+ }
8919
+
8920
+ sum += d_all * y[i].d * (isum - 32 * mins);
8921
+ }
8922
+
8923
+ *s = sum;
8072
8924
  #else
8073
8925
 
8074
8926
  int8_t aux8[QK_K];
@@ -8079,11 +8931,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8079
8931
 
8080
8932
  float sumf = 0;
8081
8933
  for (int i = 0; i < nb; ++i) {
8082
- const uint8_t * restrict q4 = x[i].ql;
8083
- const uint8_t * restrict qh = x[i].qh;
8084
- const int8_t * restrict q8 = y[i].qs;
8934
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
8935
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
8936
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8085
8937
  memset(aux32, 0, 8*sizeof(int32_t));
8086
- int8_t * restrict a = aux8;
8938
+ int8_t * GGML_RESTRICT a = aux8;
8087
8939
  for (int j = 0; j < QK_K; j += 128) {
8088
8940
  for (int l = 0; l < 32; ++l) {
8089
8941
  a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
@@ -8151,7 +9003,7 @@ static const int8_t keven_signs_q2xs[1024] = {
8151
9003
  };
8152
9004
  #endif
8153
9005
 
8154
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9006
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
8155
9007
  assert(n % QK_K == 0);
8156
9008
  assert(nrc == 1);
8157
9009
  UNUSED(nrc);
@@ -8159,8 +9011,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8159
9011
  UNUSED(by);
8160
9012
  UNUSED(bs);
8161
9013
 
8162
- const block_iq2_xxs * restrict x = vx;
8163
- const block_q8_K * restrict y = vy;
9014
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
9015
+ const block_q8_K * GGML_RESTRICT y = vy;
8164
9016
 
8165
9017
  const int nb = n / QK_K;
8166
9018
 
@@ -8178,8 +9030,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8178
9030
  float sumf = 0;
8179
9031
  for (int i = 0; i < nb; ++i) {
8180
9032
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8181
- const uint16_t * restrict q2 = x[i].qs;
8182
- const int8_t * restrict q8 = y[i].qs;
9033
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9034
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8183
9035
  float sumf1 = 0, sumf2 = 0;
8184
9036
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
8185
9037
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -8215,8 +9067,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8215
9067
  __m256 accumf = _mm256_setzero_ps();
8216
9068
  for (int i = 0; i < nb; ++i) {
8217
9069
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8218
- const uint16_t * restrict q2 = x[i].qs;
8219
- const int8_t * restrict q8 = y[i].qs;
9070
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9071
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8220
9072
  __m256i sumi1 = _mm256_setzero_si256();
8221
9073
  __m256i sumi2 = _mm256_setzero_si256();
8222
9074
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -8256,8 +9108,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8256
9108
  __m256 accumf = _mm256_setzero_ps();
8257
9109
  for (int i = 0; i < nb; ++i) {
8258
9110
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8259
- const uint16_t * restrict q2 = x[i].qs;
8260
- const int8_t * restrict q8 = y[i].qs;
9111
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9112
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8261
9113
  __m128i sumi1_0 = _mm_setzero_si128();
8262
9114
  __m128i sumi1_1 = _mm_setzero_si128();
8263
9115
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -8321,8 +9173,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8321
9173
  vector signed int vsumi2 = v0;
8322
9174
  vector signed int vsumi3 = v0;
8323
9175
 
8324
- const uint16_t * restrict q2 = x[i].qs;
8325
- const int8_t * restrict q8 = y[i].qs;
9176
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9177
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8326
9178
 
8327
9179
  for (int j = 0; j < QK_K/32; j += 2) {
8328
9180
  __builtin_prefetch(q2, 0, 1);
@@ -8398,8 +9250,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8398
9250
  __m256 accumf = (__m256)__lasx_xvldi(0);
8399
9251
  for (int i = 0; i < nb; ++i) {
8400
9252
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8401
- const uint16_t * restrict q2 = x[i].qs;
8402
- const int8_t * restrict q8 = y[i].qs;
9253
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9254
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8403
9255
  __m256i sumi1 = __lasx_xvldi(0);
8404
9256
  __m256i sumi2 = __lasx_xvldi(0);
8405
9257
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -8429,7 +9281,57 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8429
9281
  }
8430
9282
 
8431
9283
  *s = 0.125f * hsum_float_8(accumf);
8432
-
9284
+ //#elif defined(__VXE__) || defined(__VXE2__)
9285
+ // const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9286
+ //
9287
+ // uint32_t aux32[4];
9288
+ // const uint8_t * aux8 = (const uint8_t *)aux32;
9289
+ //
9290
+ // float sumf = 0;
9291
+ //
9292
+ // for (int i = 0; i < nb; ++i) {
9293
+ // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9294
+ // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9295
+ // const int8_t * GGML_RESTRICT q8 = y[i].qs;
9296
+ //
9297
+ // float sumf1 = 0, sumf2 = 0;
9298
+ //
9299
+ // for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9300
+ // int8x16_t q8b0 = vec_xl( 0, q8);
9301
+ // int8x16_t q8b1 = vec_xl(16, q8);
9302
+ // int8x16_t q8b2 = vec_xl(32, q8);
9303
+ // int8x16_t q8b3 = vec_xl(48, q8);
9304
+ // q8 += 64;
9305
+ //
9306
+ // memcpy(aux32, q2, 4 * sizeof(uint32_t));
9307
+ // q2 += 8;
9308
+ //
9309
+ // int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) };
9310
+ // int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) };
9311
+ // int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) };
9312
+ // int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) };
9313
+ //
9314
+ // int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) };
9315
+ // int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) };
9316
+ // int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) };
9317
+ // int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) };
9318
+ //
9319
+ // q2u0 = vec_mul(q2u0, q2s0);
9320
+ // q2u1 = vec_mul(q2u1, q2s1);
9321
+ // q2u2 = vec_mul(q2u2, q2s2);
9322
+ // q2u3 = vec_mul(q2u3, q2s3);
9323
+ //
9324
+ // const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1);
9325
+ // const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3);
9326
+ //
9327
+ // sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28));
9328
+ // sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28));
9329
+ // }
9330
+ //
9331
+ // sumf += d * (sumf1 + sumf2);
9332
+ // }
9333
+ //
9334
+ // *s = 0.25f * sumf;
8433
9335
  #else
8434
9336
 
8435
9337
  uint32_t aux32[2];
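The commented-out block above is a stubbed s390x (VXE) kernel for IQ2_XXS that mirrors the existing NEON/AVX paths. As a plain-C illustration of the data layout it consumes, the per-32-weight unpack looks roughly like the sketch below; it reuses only names visible in this diff (iq2xxs_grid, keven_signs_q2xs) and illustrates the format rather than the shipped fallback:

    #include <stdint.h>
    #include <string.h>

    /* Lookup tables from ggml-quants: 256 grid codewords and 128 precomputed
     * sign patterns whose bytes are +1 or -1 (assumed from the vec_mul usage above). */
    extern const uint64_t iq2xxs_grid[256];
    extern const uint64_t keven_signs_q2xs[128];

    /* Expand one 32-weight group of an IQ2_XXS block: q2 points at the four uint16
     * words of the group (what the kernel reads into aux32). Returns the 4-bit
     * group scale; out[] receives the signed codebook values before scaling. */
    static int iq2xxs_unpack_group(const uint16_t * q2, int8_t out[32]) {
        uint32_t aux32[2];
        memcpy(aux32, q2, 2 * sizeof(uint32_t));
        const uint8_t * aux8 = (const uint8_t *)aux32;        /* four grid indices */
        for (int l = 0; l < 4; ++l) {
            const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
            const uint64_t  s64  = keven_signs_q2xs[(aux32[1] >> (7 * l)) & 127];
            const int8_t  * sgn  = (const int8_t *)&s64;      /* bytes are +1 / -1 */
            for (int j = 0; j < 8; ++j) out[8 * l + j] = (int8_t)(grid[j] * sgn[j]);
        }
        return aux32[1] >> 28;   /* per-group scale; the kernel applies d * (0.5f + scale) */
    }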
@@ -8438,8 +9340,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8438
9340
  float sumf = 0.f;
8439
9341
  for (int i = 0; i < nb; ++i) {
8440
9342
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8441
- const uint16_t * restrict q2 = x[i].qs;
8442
- const int8_t * restrict q8 = y[i].qs;
9343
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9344
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8443
9345
  int32_t bsum = 0;
8444
9346
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
8445
9347
  memcpy(aux32, q2, 2*sizeof(uint32_t));
@@ -8462,7 +9364,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8462
9364
  #endif
8463
9365
  }
8464
9366
 
8465
- void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9367
+ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
8466
9368
  assert(n % QK_K == 0);
8467
9369
  assert(nrc == 1);
8468
9370
  UNUSED(nrc);
@@ -8470,8 +9372,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8470
9372
  UNUSED(by);
8471
9373
  UNUSED(bs);
8472
9374
 
8473
- const block_iq2_xs * restrict x = vx;
8474
- const block_q8_K * restrict y = vy;
9375
+ const block_iq2_xs * GGML_RESTRICT x = vx;
9376
+ const block_q8_K * GGML_RESTRICT y = vy;
8475
9377
 
8476
9378
  const int nb = n / QK_K;
8477
9379
 
@@ -8488,8 +9390,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8488
9390
  float sumf = 0;
8489
9391
  for (int i = 0; i < nb; ++i) {
8490
9392
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8491
- const uint16_t * restrict q2 = x[i].qs;
8492
- const int8_t * restrict q8 = y[i].qs;
9393
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9394
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8493
9395
  const uint8x8_t scales8 = vld1_u8(x[i].scales);
8494
9396
  const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
8495
9397
  const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
@@ -8566,8 +9468,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8566
9468
  __m256 accumf = _mm256_setzero_ps();
8567
9469
  for (int i = 0; i < nb; ++i) {
8568
9470
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8569
- const uint16_t * restrict q2 = x[i].qs;
8570
- const int8_t * restrict q8 = y[i].qs;
9471
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9472
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8571
9473
 
8572
9474
  memcpy(&aux64, x[i].scales, 8);
8573
9475
  __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8687,8 +9589,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8687
9589
  __m256 accumf = _mm256_setzero_ps();
8688
9590
  for (int i = 0; i < nb; ++i) {
8689
9591
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8690
- const uint16_t * restrict q2 = x[i].qs;
8691
- const int8_t * restrict q8 = y[i].qs;
9592
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9593
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8692
9594
 
8693
9595
  memcpy(&aux64, x[i].scales, 8);
8694
9596
  __m128i stmp = _mm_set1_epi64x(aux64);
@@ -8842,8 +9744,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8842
9744
  __m256 accumf = (__m256)__lasx_xvldi(0);
8843
9745
  for (int i = 0; i < nb; ++i) {
8844
9746
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8845
- const uint16_t * restrict q2 = x[i].qs;
8846
- const int8_t * restrict q8 = y[i].qs;
9747
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9748
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8847
9749
 
8848
9750
  memcpy(&aux64, x[i].scales, 8);
8849
9751
  __m128i stmp = __lsx_vreplgr2vr_d(aux64);
@@ -8940,9 +9842,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8940
9842
  vector signed int vsumi2 = v0;
8941
9843
  vector signed int vsumi3 = v0;
8942
9844
 
8943
- const uint16_t * restrict q2 = x[i].qs;
8944
- const uint8_t * restrict sc = x[i].scales;
8945
- const int8_t * restrict q8 = y[i].qs;
9845
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9846
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
9847
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
8946
9848
 
8947
9849
  for (int j = 0; j < QK_K/64; ++j) {
8948
9850
  __builtin_prefetch(q2, 0, 1);
@@ -9012,9 +9914,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9012
9914
  float sumf = 0.f;
9013
9915
  for (int i = 0; i < nb; ++i) {
9014
9916
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9015
- const uint16_t * restrict q2 = x[i].qs;
9016
- const uint8_t * restrict sc = x[i].scales;
9017
- const int8_t * restrict q8 = y[i].qs;
9917
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
9918
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
9919
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9018
9920
  int32_t bsum = 0;
9019
9921
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9020
9922
  const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
@@ -9047,7 +9949,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9047
9949
  #endif
9048
9950
  }
9049
9951
 
9050
- void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9952
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9051
9953
  assert(n % QK_K == 0);
9052
9954
  assert(nrc == 1);
9053
9955
  UNUSED(nrc);
@@ -9055,8 +9957,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9055
9957
  UNUSED(by);
9056
9958
  UNUSED(bs);
9057
9959
 
9058
- const block_iq2_s * restrict x = vx;
9059
- const block_q8_K * restrict y = vy;
9960
+ const block_iq2_s * GGML_RESTRICT x = vx;
9961
+ const block_q8_K * GGML_RESTRICT y = vy;
9060
9962
 
9061
9963
  const int nb = n / QK_K;
9062
9964
 
@@ -9082,10 +9984,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9082
9984
 
9083
9985
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9084
9986
 
9085
- const uint8_t * restrict qs = x[i].qs;
9086
- const uint8_t * restrict qh = x[i].qh;
9087
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9088
- const int8_t * restrict q8 = y[i].qs;
9987
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
9988
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
9989
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
9990
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9089
9991
 
9090
9992
  int sumi1 = 0, sumi2 = 0;
9091
9993
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9156,10 +10058,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9156
10058
  __m256 accumf = _mm256_setzero_ps();
9157
10059
  for (int i = 0; i < nb; ++i) {
9158
10060
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9159
- const uint8_t * restrict qs = x[i].qs;
9160
- const uint8_t * restrict qh = x[i].qh;
9161
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9162
- const int8_t * restrict q8 = y[i].qs;
10061
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10062
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10063
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10064
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9163
10065
 
9164
10066
  memcpy(&aux64, x[i].scales, 8);
9165
10067
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -9229,10 +10131,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9229
10131
  __m256 accumf = _mm256_setzero_ps();
9230
10132
  for (int i = 0; i < nb; ++i) {
9231
10133
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9232
- const uint8_t * restrict qs = x[i].qs;
9233
- const uint8_t * restrict qh = x[i].qh;
9234
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9235
- const int8_t * restrict q8 = y[i].qs;
10134
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10135
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10136
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10137
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9236
10138
 
9237
10139
  memcpy(&aux64, x[i].scales, 8);
9238
10140
  const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
@@ -9327,11 +10229,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9327
10229
  vector signed int vsumi2 = v0;
9328
10230
  vector signed int vsumi3 = v0;
9329
10231
 
9330
- const uint8_t * restrict q2 = x[i].qs;
9331
- const uint8_t * restrict qh = x[i].qh;
9332
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9333
- const uint8_t * restrict sc = x[i].scales;
9334
- const int8_t * restrict q8 = y[i].qs;
10232
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
10233
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10234
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10235
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
10236
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9335
10237
 
9336
10238
  for (int j = 0; j < QK_K/32; j += 2) {
9337
10239
  __builtin_prefetch(q2, 0, 1);
@@ -9428,10 +10330,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9428
10330
  __m256 accumf = (__m256)__lasx_xvldi(0);
9429
10331
  for (int i = 0; i < nb; ++i) {
9430
10332
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9431
- const uint8_t * restrict qs = x[i].qs;
9432
- const uint8_t * restrict qh = x[i].qh;
9433
- const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
9434
- const int8_t * restrict q8 = y[i].qs;
10333
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10334
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10335
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
10336
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9435
10337
 
9436
10338
  __m128i tmp1;
9437
10339
  memcpy(&aux64, x[i].scales, 8);
@@ -9525,7 +10427,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9525
10427
 
9526
10428
  }
9527
10429
 
9528
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10430
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9529
10431
  assert(n % QK_K == 0);
9530
10432
  assert(nrc == 1);
9531
10433
  UNUSED(nrc);
@@ -9533,8 +10435,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9533
10435
  UNUSED(by);
9534
10436
  UNUSED(bs);
9535
10437
 
9536
- const block_iq3_xxs * restrict x = vx;
9537
- const block_q8_K * restrict y = vy;
10438
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
10439
+ const block_q8_K * GGML_RESTRICT y = vy;
9538
10440
 
9539
10441
  const int nb = n / QK_K;
9540
10442
 
@@ -9550,9 +10452,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9550
10452
  float sumf = 0;
9551
10453
  for (int i = 0; i < nb; ++i) {
9552
10454
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9553
- const uint8_t * restrict q3 = x[i].qs;
9554
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9555
- const int8_t * restrict q8 = y[i].qs;
10455
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10456
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10457
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9556
10458
  float sumf1 = 0, sumf2 = 0;
9557
10459
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
9558
10460
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
@@ -9588,9 +10490,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9588
10490
  __m256 accumf = _mm256_setzero_ps();
9589
10491
  for (int i = 0; i < nb; ++i) {
9590
10492
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9591
- const uint8_t * restrict q3 = x[i].qs;
9592
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9593
- const int8_t * restrict q8 = y[i].qs;
10493
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10494
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10495
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9594
10496
  __m256i sumi1 = _mm256_setzero_si256();
9595
10497
  __m256i sumi2 = _mm256_setzero_si256();
9596
10498
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9633,9 +10535,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9633
10535
  __m256 accumf = _mm256_setzero_ps();
9634
10536
  for (int i = 0; i < nb; ++i) {
9635
10537
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9636
- const uint8_t * restrict q3 = x[i].qs;
9637
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9638
- const int8_t * restrict q8 = y[i].qs;
10538
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10539
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10540
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9639
10541
  __m128i sumi1_0 = _mm_setzero_si128();
9640
10542
  __m128i sumi1_1 = _mm_setzero_si128();
9641
10543
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -9702,9 +10604,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9702
10604
  vector signed int vsumi2 = v0;
9703
10605
  vector signed int vsumi3 = v0;
9704
10606
 
9705
- const uint8_t * restrict q3 = x[i].qs;
9706
- const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
9707
- const int8_t * restrict q8 = y[i].qs;
10607
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10608
+ const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4);
10609
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9708
10610
 
9709
10611
  #pragma GCC unroll 1
9710
10612
  for (int j = 0; j < QK_K/32; j += 2) {
@@ -9776,9 +10678,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9776
10678
  __m256 accumf = (__m256)__lasx_xvldi(0);
9777
10679
  for (int i = 0; i < nb; ++i) {
9778
10680
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9779
- const uint8_t * restrict q3 = x[i].qs;
9780
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9781
- const int8_t * restrict q8 = y[i].qs;
10681
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10682
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10683
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9782
10684
  __m256i sumi1 = __lasx_xvldi(0);
9783
10685
  __m256i sumi2 = __lasx_xvldi(0);
9784
10686
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -9821,9 +10723,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9821
10723
  float sumf = 0.f;
9822
10724
  for (int i = 0; i < nb; ++i) {
9823
10725
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9824
- const uint8_t * restrict q3 = x[i].qs;
9825
- const uint8_t * restrict gas = x[i].qs + QK_K/4;
9826
- const int8_t * restrict q8 = y[i].qs;
10726
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
10727
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
10728
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9827
10729
  int32_t bsum = 0;
9828
10730
  for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
9829
10731
  memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
@@ -9848,7 +10750,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9848
10750
  #endif
9849
10751
  }
9850
10752
 
9851
- void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10753
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9852
10754
  assert(n % QK_K == 0);
9853
10755
  assert(nrc == 1);
9854
10756
  UNUSED(nrc);
@@ -9856,8 +10758,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9856
10758
  UNUSED(by);
9857
10759
  UNUSED(bs);
9858
10760
 
9859
- const block_iq3_s * restrict x = vx;
9860
- const block_q8_K * restrict y = vy;
10761
+ const block_iq3_s * GGML_RESTRICT x = vx;
10762
+ const block_q8_K * GGML_RESTRICT y = vy;
9861
10763
 
9862
10764
  const int nb = n / QK_K;
9863
10765
 
@@ -9894,10 +10796,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9894
10796
  float sumf = 0;
9895
10797
  for (int i = 0; i < nb; ++i) {
9896
10798
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9897
- const uint8_t * restrict qs = x[i].qs;
9898
- const uint8_t * restrict qh = x[i].qh;
9899
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
9900
- const int8_t * restrict q8 = y[i].qs;
10799
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10800
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10801
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10802
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9901
10803
 
9902
10804
  memcpy(scales32, x[i].scales, 4);
9903
10805
  scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
@@ -9976,10 +10878,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9976
10878
  __m256 accumf = _mm256_setzero_ps();
9977
10879
  for (int i = 0; i < nb; ++i) {
9978
10880
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9979
- const uint8_t * restrict qs = x[i].qs;
9980
- const uint8_t * restrict qh = x[i].qh;
9981
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
9982
- const int8_t * restrict q8 = y[i].qs;
10881
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10882
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10883
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10884
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
9983
10885
  __m256i sumi1 = _mm256_setzero_si256();
9984
10886
  __m256i sumi2 = _mm256_setzero_si256();
9985
10887
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10061,10 +10963,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10061
10963
  __m256 accumf = _mm256_setzero_ps();
10062
10964
  for (int i = 0; i < nb; ++i) {
10063
10965
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10064
- const uint8_t * restrict qs = x[i].qs;
10065
- const uint8_t * restrict qh = x[i].qh;
10066
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10067
- const int8_t * restrict q8 = y[i].qs;
10966
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
10967
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
10968
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
10969
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10068
10970
  __m128i sumi1_0 = _mm_setzero_si128();
10069
10971
  __m128i sumi1_1 = _mm_setzero_si128();
10070
10972
  __m128i sumi2_0 = _mm_setzero_si128();
@@ -10162,11 +11064,11 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10162
11064
  vector float vyd = vec_splats(y[i].d);
10163
11065
  vector float vd = vec_mul(vxd, vyd);
10164
11066
 
10165
- const uint8_t * restrict q3 = x[i].qs;
10166
- const uint8_t * restrict qh = x[i].qh;
10167
- const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
10168
- const uint8_t * restrict sc = x[i].scales;
10169
- const int8_t * restrict q8 = y[i].qs;
11067
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
11068
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11069
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs);
11070
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
11071
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10170
11072
 
10171
11073
  vector signed int vsumi0 = v0;
10172
11074
  vector signed int vsumi1 = v0;
@@ -10273,10 +11175,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10273
11175
  __m256 accumf = (__m256)__lasx_xvldi(0);
10274
11176
  for (int i = 0; i < nb; ++i) {
10275
11177
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10276
- const uint8_t * restrict qs = x[i].qs;
10277
- const uint8_t * restrict qh = x[i].qh;
10278
- const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10279
- const int8_t * restrict q8 = y[i].qs;
11178
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11179
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11180
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
11181
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10280
11182
  __m256i sumi1 = __lasx_xvldi(0);
10281
11183
  __m256i sumi2 = __lasx_xvldi(0);
10282
11184
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
@@ -10334,10 +11236,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10334
11236
  float sumf = 0.f;
10335
11237
  for (int i = 0; i < nb; ++i) {
10336
11238
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10337
- const uint8_t * restrict qs = x[i].qs;
10338
- const uint8_t * restrict qh = x[i].qh;
10339
- const uint8_t * restrict signs = x[i].signs;
10340
- const int8_t * restrict q8 = y[i].qs;
11239
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
11240
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
11241
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
11242
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
10341
11243
  int32_t bsum = 0;
10342
11244
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10343
11245
  const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
@@ -10389,7 +11291,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
10389
11291
  }
10390
11292
  #endif
10391
11293
 
10392
- void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11294
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10393
11295
  assert(n % QK_K == 0);
10394
11296
  assert(nrc == 1);
10395
11297
  UNUSED(nrc);
@@ -10397,8 +11299,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10397
11299
  UNUSED(by);
10398
11300
  UNUSED(bs);
10399
11301
 
10400
- const block_iq1_s * restrict x = vx;
10401
- const block_q8_K * restrict y = vy;
11302
+ const block_iq1_s * GGML_RESTRICT x = vx;
11303
+ const block_q8_K * GGML_RESTRICT y = vy;
10402
11304
 
10403
11305
  const int nb = n / QK_K;
10404
11306
 
@@ -10460,10 +11362,19 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10460
11362
  __m256i sumi = _mm256_setzero_si256();
10461
11363
  int sumi1 = 0;
10462
11364
  for (int ib = 0; ib < QK_K/32; ib += 2) {
11365
+ #ifdef __BMI2__
11366
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
11367
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
11368
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
11369
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
11370
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
11371
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
11372
+ #else
10463
11373
  const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
10464
11374
  iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
10465
11375
  const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
10466
11376
  iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
11377
+ #endif
10467
11378
  qs += 8;
10468
11379
  const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10469
11380
  const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
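The new __BMI2__ branch above builds the four 11-bit iq1s_grid indices of each 32-weight sub-block with two PDEP scatters instead of four shift/mask expressions: the four qs bytes land in the low byte of each 16-bit lane, and the 4x3 qh bits land in bits 8..10. A standalone sketch of that equivalence (not the shipped kernel):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #ifdef __BMI2__
    #include <immintrin.h>
    #endif

    /* Reference: index k is the qs byte plus three qh bits shifted into bits 8..10. */
    static void iq1s_indices_scalar(const uint8_t qs[4], uint16_t qh, uint16_t idx[4]) {
        for (int k = 0; k < 4; ++k)
            idx[k] = (uint16_t)(qs[k] | (((qh >> (3 * k)) & 7) << 8));
    }

    #ifdef __BMI2__
    /* Same result via the two PDEP scatters used in the hunk above. */
    static void iq1s_indices_pdep(const uint8_t qs[4], uint16_t qh, uint16_t idx[4]) {
        uint32_t qs32;
        memcpy(&qs32, qs, 4);
        const uint64_t packed = _pdep_u64(qs32, 0x00ff00ff00ff00ffULL)
                              | _pdep_u64(qh,   0x0700070007000700ULL);
        memcpy(idx, &packed, 8);
    }
    #endif

    int main(void) {
        const uint8_t qs[4] = { 0x12, 0x34, 0x56, 0x78 };
        const uint16_t qh = 0x0ABC;              /* only the low 12 bits are consumed */
        uint16_t a[4];
        iq1s_indices_scalar(qs, qh, a);
        for (int k = 0; k < 4; ++k) printf("%03x ", a[k]);
        printf("\n");
    #ifdef __BMI2__
        uint16_t b[4];
        iq1s_indices_pdep(qs, qh, b);
        printf("pdep matches: %d\n", memcmp(a, b, sizeof(a)) == 0);
    #endif
        return 0;
    }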
@@ -10556,10 +11467,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10556
11467
  vector signed int vsumi3 = vec_splats((int32_t)0);
10557
11468
  vector signed int vsumi8 = vec_splats((int32_t)0);
10558
11469
 
10559
- const uint8_t * restrict q1 = x[i].qs;
10560
- const uint16_t * restrict qh = x[i].qh;
10561
- const int8_t * restrict q8 = y[i].qs;
10562
- const int16_t * restrict qs = y[i].bsums;
11470
+ const uint8_t * GGML_RESTRICT q1 = x[i].qs;
11471
+ const uint16_t * GGML_RESTRICT qh = x[i].qh;
11472
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
11473
+ const int16_t * GGML_RESTRICT qs = y[i].bsums;
10563
11474
 
10564
11475
  for (int j = 0; j < QK_K/32; j += 2) {
10565
11476
  __builtin_prefetch(q1, 0, 1);
@@ -10720,7 +11631,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10720
11631
  #endif
10721
11632
  }
10722
11633
 
10723
- void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11634
+ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10724
11635
  assert(n % QK_K == 0);
10725
11636
  assert(nrc == 1);
10726
11637
  UNUSED(nrc);
@@ -10728,8 +11639,8 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10728
11639
  UNUSED(by);
10729
11640
  UNUSED(bs);
10730
11641
 
10731
- const block_iq1_m * restrict x = vx;
10732
- const block_q8_K * restrict y = vy;
11642
+ const block_iq1_m * GGML_RESTRICT x = vx;
11643
+ const block_q8_K * GGML_RESTRICT y = vy;
10733
11644
 
10734
11645
  const int nb = n / QK_K;
10735
11646
 
@@ -10809,6 +11720,10 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10809
11720
 
10810
11721
  const __m256i mask = _mm256_set1_epi16(0x7);
10811
11722
  const __m256i mone = _mm256_set1_epi16(1);
11723
+ const __m256i mone8 = _mm256_set1_epi8(1);
11724
+ const __m256i mtwo8 = _mm256_set1_epi8(2);
11725
+ // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half.
11726
+ const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);
10812
11727
 
10813
11728
  __m256 accum1 = _mm256_setzero_ps();
10814
11729
  __m256 accum2 = _mm256_setzero_ps();
@@ -10820,10 +11735,33 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10820
11735
  const uint16_t * sc = (const uint16_t *)x[i].scales;
10821
11736
 
10822
11737
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
11738
+ // Extract 3-bit scales (16 values)
11739
+ __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc);
11740
+ scales = _mm256_srlv_epi64(scales, scales_shift);
11741
+ scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);
11742
+
11743
+ // Indices to repeat each scale 8 times.
11744
+ __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
11745
+ __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));
10823
11746
 
10824
11747
  __m256i sumi1 = _mm256_setzero_si256();
10825
11748
  __m256i sumi2 = _mm256_setzero_si256();
10826
11749
  for (int ib = 0; ib < QK_K/32; ib += 2) {
11750
+ #ifdef __BMI2__
11751
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
11752
+ | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
11753
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
11754
+ | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
11755
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
11756
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
11757
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
11758
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
11759
+
11760
+ // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
11761
+ const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
11762
+ const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
11763
+ const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
11764
+ #else
10827
11765
  const __m256i q1b_1 = _mm256_set_epi64x(
10828
11766
  iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
10829
11767
  iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
@@ -10832,11 +11770,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10832
11770
  iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
10833
11771
  iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
10834
11772
  );
10835
- const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10836
- const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10837
-
10838
- const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
10839
- const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
10840
11773
 
10841
11774
  const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
10842
11775
  qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
@@ -10846,15 +11779,21 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
10846
11779
  qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
10847
11780
  qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
10848
11781
  qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
11782
+ #endif
11783
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
11784
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10849
11785
 
10850
- const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
10851
- const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
11786
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
11787
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
11788
+ const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
11789
+ const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
11790
+
11791
+ __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
11792
+ __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);
10852
11793
 
10853
- __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
10854
- __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
11794
+ scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
11795
+ scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);
10855
11796
 
10856
- scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
10857
- scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
10858
11797
  const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
10859
11798
  const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
10860
11799
  const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
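The rewritten IQ1_M AVX2 kernel above makes two changes: all sixteen 3-bit scales of a block are expanded once up front (variable 64-bit shifts plus a VPSHUFB broadcast, with the odd-numbered scale groups routed to the upper 128-bit lane, as the in-code comment notes), and the +/-1 delta dot products now use _mm256_maddubs_epi16 over sign-adjusted q8 bytes instead of a second mul_add_epi8. A scalar sketch of the scale decode those shuffles implement (illustration only; the (2*s + 1) mapping is taken from the kernel itself):

    #include <stdint.h>

    /* Each of the four uint16 scale words of an IQ1_M block carries four 3-bit
     * scales in bits 0..11 (the top nibble of every word belongs to the fp16 delta
     * reassembled as scale.u16 above). Decode all 16 scales as the kernel does. */
    static void iq1m_decode_scales(const uint16_t sc[4], int scales[16]) {
        for (int w = 0; w < 4; ++w) {
            for (int k = 0; k < 4; ++k) {
                const int s3 = (sc[w] >> (3 * k)) & 7;   /* 3-bit field */
                scales[4 * w + k] = 2 * s3 + 1;          /* matches (scale << 1) + 1 */
            }
        }
    }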
@@ -11010,7 +11949,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
11010
11949
  #endif
11011
11950
  }
11012
11951
 
11013
- void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
11952
+ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
11014
11953
  assert(nrc == 1);
11015
11954
  UNUSED(nrc);
11016
11955
  UNUSED(bx);
@@ -11019,8 +11958,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11019
11958
  assert(n % QK4_NL == 0);
11020
11959
  static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
11021
11960
 
11022
- const block_iq4_nl * restrict x = vx;
11023
- const block_q8_0 * restrict y = vy;
11961
+ const block_iq4_nl * GGML_RESTRICT x = vx;
11962
+ const block_q8_0 * GGML_RESTRICT y = vy;
11024
11963
 
11025
11964
  const int nb = n / QK4_NL;
11026
11965
 
@@ -11190,6 +12129,27 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11190
12129
 
11191
12130
  sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
11192
12131
 
12132
+ #elif defined(__VXE__) || defined(__VXE2__)
12133
+ const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
12134
+ const uint8x16_t v_m = vec_splat_u8(0x0F);
12135
+
12136
+ for (; ib < nb; ++ib) {
12137
+ const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
12138
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
12139
+
12140
+ const uint8x16_t v_x = vec_xl(0, x0->qs);
12141
+ int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
12142
+ int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
12143
+
12144
+ v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
12145
+ v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
12146
+
12147
+ const int8x16_t v_yl = vec_xl(0 , y0->qs);
12148
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
12149
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
12150
+
12151
+ sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
12152
+ }
11193
12153
  #endif
11194
12154
  for (; ib < nb; ++ib) {
11195
12155
  const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
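The new VXE branch above is the s390x counterpart of the existing IQ4_NL kernels: low and high nibbles of each qs byte are mapped through the 16-entry non-linear codebook (kvalues_iq4nl) with vec_perm, then dotted against the q8 values. A scalar reference for one block, with the codebook passed in rather than referenced directly (illustration only):

    #include <stdint.h>

    /* One IQ4_NL x Q8_0 block: 16 bytes of packed nibbles against 32 int8 values.
     * kvalues is the 16-entry signed codebook (kvalues_iq4nl in ggml-quants). */
    static int iq4nl_block_dot(const uint8_t qs[16], const int8_t q8[32], const int8_t kvalues[16]) {
        int sumi = 0;
        for (int j = 0; j < 16; ++j) {
            sumi += q8[j]      * kvalues[qs[j] & 0x0F];   /* low nibbles -> first 16 q8 */
            sumi += q8[j + 16] * kvalues[qs[j] >> 4];     /* high nibbles -> last 16 q8 */
        }
        return sumi;   /* caller scales by GGML_FP16_TO_FP32(x->d) * GGML_FP16_TO_FP32(y->d) */
    }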
@@ -11203,7 +12163,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11203
12163
  *s = sumf;
11204
12164
  }
11205
12165
 
11206
- void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
12166
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
11207
12167
  assert(nrc == 1);
11208
12168
  UNUSED(nrc);
11209
12169
  UNUSED(bx);
@@ -11211,8 +12171,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11211
12171
  UNUSED(bs);
11212
12172
  assert(n % QK_K == 0);
11213
12173
 
11214
- const block_iq4_xs * restrict x = vx;
11215
- const block_q8_K * restrict y = vy;
12174
+ const block_iq4_xs * GGML_RESTRICT x = vx;
12175
+ const block_q8_K * GGML_RESTRICT y = vy;
11216
12176
 
11217
12177
  const int nb = n / QK_K;
11218
12178
 
@@ -11369,9 +12329,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11369
12329
 
11370
12330
  uint16_t h = x[ibl].scales_h;
11371
12331
 
11372
- const uint8_t * restrict q4 = x[ibl].qs;
11373
- const uint8_t * restrict sc = x[ibl].scales_l;
11374
- const int8_t * restrict q8 = y[ibl].qs;
12332
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
12333
+ const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l;
12334
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
11375
12335
 
11376
12336
  for (int ib = 0; ib < QK_K/64; ib ++ ) {
11377
12337
  __builtin_prefetch(q4, 0, 1);
@@ -11468,6 +12428,56 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11468
12428
  }
11469
12429
 
11470
12430
  *s = hsum_float_8(accum);
12431
+ #elif defined(__VXE__) || defined(__VXE2__)
12432
+ const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
12433
+ const uint8x16_t v_m = vec_splat_u8(0x0F);
12434
+
12435
+ float sumf = 0;
12436
+
12437
+ for (int ibl = 0; ibl < nb; ++ibl) {
12438
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
12439
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
12440
+
12441
+ uint16_t h = x[ibl].scales_h;
12442
+
12443
+ int sumi1 = 0, sumi2 = 0;
12444
+ for (int ib = 0; ib < QK_K/64; ++ib) {
12445
+ const uint8x16_t v_x0 = vec_xl(0 , q4);
12446
+ const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4);
12447
+ q4 += 32;
12448
+
12449
+ int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
12450
+ int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
12451
+ int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
12452
+ int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
12453
+
12454
+ v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
12455
+ v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
12456
+ v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
12457
+ v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
12458
+
12459
+ const int8x16_t v_y0 = vec_xl( 0, q8);
12460
+ const int8x16_t v_y1 = vec_xl(16, q8);
12461
+ const int8x16_t v_y2 = vec_xl(32, q8);
12462
+ const int8x16_t v_y3 = vec_xl(48, q8);
12463
+ q8 += 64;
12464
+
12465
+ int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1);
12466
+ int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);
12467
+
12468
+ int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
12469
+ int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
12470
+
12471
+ h >>= 4;
12472
+
12473
+ sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
12474
+ sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
12475
+ }
12476
+
12477
+ sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
12478
+ }
12479
+
12480
+ *s = sumf;
11471
12481
 
11472
12482
  #else
11473
12483
  float sumf = 0;
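In the VXE IQ4_XS branch above, each 32-weight group carries a 6-bit scale split across scales_l (low 4 bits, one nibble per group) and scales_h (high 2 bits, consumed two groups at a time via h >>= 4); the ls1/ls2 expressions decode and re-center it. A scalar sketch of the same decode (assuming QK_K == 256 as in the surrounding kernels):

    #include <stdint.h>

    #define QK_K 256   /* block size assumed from the surrounding kernels */

    /* Decode the per-group scales of one IQ4_XS superblock, exactly as ls1/ls2 above. */
    static void iq4xs_decode_scales(const uint8_t scales_l[QK_K/64], uint16_t h, int out[QK_K/32]) {
        for (int ib = 0; ib < QK_K/64; ++ib) {
            out[2*ib + 0] = ((scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;   /* ls1 */
            out[2*ib + 1] = ((scales_l[ib] >> 4)  | ((h << 2) & 0x30)) - 32;   /* ls2 */
            h >>= 4;   /* two 2-bit high fields consumed per iteration */
        }
    }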
@@ -11506,12 +12516,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11506
12516
 
11507
12517
  // ============================ 4-bit non-linear quants
11508
12518
 
11509
- void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) {
12519
+ void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
11510
12520
  assert(k % QK4_NL == 0);
11511
12521
  quantize_row_iq4_nl_ref(x, y, k);
11512
12522
  }
11513
12523
 
11514
- void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) {
12524
+ void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
11515
12525
  assert(k % QK_K == 0);
11516
12526
  quantize_iq4_xs(x, y, 1, k, NULL);
11517
12527
  }