@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
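
The bulk of this release is a llama.cpp sync. The largest single file change is ggml/src/ggml-aarch64.c (entry 105 above, +2135 -1119), whose diff follows: it adds AVX/AVX2 paths for the interleaved Q4_0/Q8_0 GEMV/GEMM kernels that previously only had ARM NEON/SVE implementations. For orientation, below is a minimal scalar sketch of the Q8_0-style block quantization those kernels build on (per 32-value block, scale = max|x| / 127, matching the diff's comment "Divided by 127.f to mirror results in quantize_row_q8_0"). The struct and function names here are illustrative stand-ins, not ggml's actual API (ggml stores the scale as fp16, not float).

    #include <math.h>
    #include <stdint.h>

    #define QK8_0 32  // values per block, as in ggml's Q8_0 format

    // Illustrative stand-in for ggml's block_q8_0 (the real type keeps
    // the scale as an fp16; a float is used here for simplicity).
    typedef struct {
        float  d;          // per-block scale
        int8_t qs[QK8_0];  // quantized values
    } example_block_q8_0;

    // Scalar reference for one block: scale = max|x| / 127, then each
    // value is stored as round(x / scale). The AVX2 path in the diff
    // performs the same max-abs reduction and rounding with intrinsics.
    static void example_quantize_block_q8_0(const float *x, example_block_q8_0 *y) {
        float amax = 0.0f;
        for (int i = 0; i < QK8_0; i++) {
            const float ax = fabsf(x[i]);
            if (ax > amax) amax = ax;
        }

        const float d  = amax / 127.0f;
        const float id = (d != 0.0f) ? 1.0f / d : 0.0f;

        y->d = d;
        for (int i = 0; i < QK8_0; i++) {
            y->qs[i] = (int8_t) roundf(x[i] * id);
        }
    }
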
package/src/llama.cpp/ggml/src/ggml-aarch64.c
@@ -1,9 +1,13 @@
- // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
+ // SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+ // SPDX-License-Identifier: MIT
+ //
+
  #define GGML_COMMON_IMPL_C
  #include "ggml-common.h"

  #include "ggml-quants.h"
  #include "ggml-impl.h"
+ #include "ggml-cpu-impl.h"

  #include <math.h>
  #include <string.h>
@@ -16,6 +20,8 @@

  #if defined(__GNUC__)
  #pragma GCC diagnostic ignored "-Woverlength-strings"
+ #elif defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

  #define UNUSED GGML_UNUSED
@@ -34,6 +40,152 @@
  // from bias offset form to pure sign form (this saves subtract
  // operations durin unpacking)
  //
+ #if defined(__AVX__)
+ #if defined(__F16C__)
+ #if defined(__AVX512F__)
+ #define GGML_F32Cx8x2_LOAD(x, y) _mm512_cvtph_ps(_mm256_set_m128i(_mm_loadu_si128((const __m128i *)(y)), _mm_loadu_si128((const __m128i *)(x))))
+ #define GGML_F32Cx16_REPEAT_LOAD(x) _mm512_cvtph_ps(_mm256_set_m128i(x, x))
+ #endif
+ // the  _mm256_cvt intrinsics require F16C
+ #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
+ #define GGML_F32Cx8_REPEAT_LOAD(x, loadMask) _mm256_cvtph_ps(_mm_shuffle_epi32(_mm_maskload_epi32((int const*)(x), loadMask), 68))
+ #define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask))
+ #else
+ #if defined(__AVX512F__)
+ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
+     float tmp[16];
+
+     for (int i = 0; i < 8; i++) {
+         tmp[i] = GGML_FP16_TO_FP32(x[i]);
+     }
+
+     for (int i = 0; i < 8; i++) {
+         tmp[i + 8] = GGML_FP16_TO_FP32(y[i]);
+     }
+
+     return _mm512_loadu_ps(tmp);
+ }
+ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
+     float tmp[16];
+     uint16_t tmphalf[8];
+     _mm_storeu_si128((__m128i*)tmphalf, x);
+
+     for (int i = 0; i < 4; i++) {
+         tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
+         tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]);
+         tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]);
+         tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]);
+     }
+
+     return _mm512_loadu_ps(tmp);
+ }
+ #endif
+ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
+     float tmp[8];
+
+     for (int i = 0; i < 8; i++) {
+         tmp[i] = GGML_FP16_TO_FP32(x[i]);
+     }
+
+     return _mm256_loadu_ps(tmp);
+ }
+ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
+     float tmp[8];
+
+     for (int i = 0; i < 4; i++) {
+         tmp[i] = GGML_FP16_TO_FP32(x[i]);
+         tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
+     }
+
+     return _mm256_loadu_ps(tmp);
+ }
+ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrangeMask) {
+     uint16_t tmphalf[8];
+     float tmp[8];
+
+     _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
+     for (int i = 0; i < 8; i++) {
+         tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
+     }
+
+     return _mm256_loadu_ps(tmp);
+ }
+
+ #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
+ #define GGML_F32Cx8_REPEAT_LOAD(x, loadMask) __avx_repeat_f32cx8_load(x)
+ #define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) __avx_rearranged_f32cx8_load(x, arrangeMask)
+ #if defined(__AVX512F__)
+ #define GGML_F32Cx8x2_LOAD(x, y) __avx512_f32cx8x2_load(x, y)
+ #define GGML_F32Cx16_REPEAT_LOAD(x) __avx512_repeat_f32cx16_load(x)
+ #endif
+ #endif
+ #endif
+
+
+ #if defined(__AVX2__) || defined(__AVX512F__)
+ #if defined(__AVX512F__)
+ // add int16_t pairwise and return as 512 bit int vector
+ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
+     const __m512i ones = _mm512_set1_epi16(1);
+     return _mm512_madd_epi16(ones, x);
+ }
+
+ static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
+ #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+     const __m512i zero = _mm512_setzero_si512();
+     return _mm512_dpbusd_epi32(zero, ax, sy);
+ #else
+     // Perform multiplication and create 16-bit values
+     const __m512i dot = _mm512_maddubs_epi16(ax, sy);
+     return sum_i16_pairs_int_32x16(dot);
+ #endif
+ }
+
+ // multiply int8_t, add results pairwise twice and return as 512 bit int vector
+ static inline __m512i mul_sum_i8_pairs_int32x16(const __m512i x, const __m512i y) {
+     const __m512i zero = _mm512_setzero_si512();
+     // Get absolute values of x vectors
+     const __m512i ax = _mm512_abs_epi8(x);
+     // Sign the values of the y vectors
+     __mmask64 blt0 = _mm512_movepi8_mask(x);
+     const __m512i sy = _mm512_mask_sub_epi8(y, blt0, zero, y);
+     return mul_sum_us8_pairs_int32x16(ax, sy);
+ }
+ #endif
+
+ // add int16_t pairwise and return as 256 bit int vector
+ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
+     const __m256i ones = _mm256_set1_epi16(1);
+     return _mm256_madd_epi16(ones, x);
+ }
+
+ static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
+ #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+     const __m256i zero = _mm256_setzero_si256();
+     return _mm256_dpbusd_epi32(zero, ax, sy);
+ #else
+     // Perform multiplication and create 16-bit values
+     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+     return sum_i16_pairs_int32x8(dot);
+ #endif
+ }
+
+ // Integer variant of the function defined in ggml-quants.c
+ // multiply int8_t, add results pairwise twice and return as 256 bit int vector
+ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) {
+ #if __AVXVNNIINT8__
+     const __m256i zero = _mm256_setzero_si256();
+     return _mm256_dpbssd_epi32(zero, x, y);
+ #else
+     // Get absolute values of x vectors
+     const __m256i ax = _mm256_sign_epi8(x, x);
+     // Sign the values of the y vectors
+     const __m256i sy = _mm256_sign_epi8(y, x);
+     return mul_sum_us8_pairs_int32x8(ax, sy);
+ #endif
+ }
+ #endif
+
  static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
      block_q4_0x4 out;

@@ -253,6 +405,103 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k)
              y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3);
          }
      }
+ #elif defined(__AVX2__) || defined(__AVX__)
+     float id[4];
+     __m256 srcv[4][4];
+     __m256 idvec[4];
+
+     for (int i = 0; i < nb; i++) {
+         for (int row_iter = 0; row_iter < 4; row_iter++) {
+             // Load elements into 4 AVX vectors
+             __m256 v0 = _mm256_loadu_ps( x + row_iter * k + i * 32 );
+             __m256 v1 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 8 );
+             __m256 v2 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 16 );
+             __m256 v3 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 24 );
+
+             // Compute max(abs(e)) for the block
+             const __m256 signBit = _mm256_set1_ps( -0.0f );
+             __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+             maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+             maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+             maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+             __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+             max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+             max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+             const float maxScalar = _mm_cvtss_f32( max4 );
+
+             // Divided by 127.f to mirror results in quantize_row_q8_0
+             const float d = maxScalar / 127.f;
+             id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
+
+             // Store the scale for the individual block
+             y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+
+             // Store the values in blocks of eight values - Aim is to use these later for block interleaving
+             srcv[row_iter][0] = v0;
+             srcv[row_iter][1] = v1;
+             srcv[row_iter][2] = v2;
+             srcv[row_iter][3] = v3;
+             idvec[row_iter] = _mm256_set1_ps(id[row_iter]);
+         }
+
+         // The loop iterates four times - The aim is to get 4 corresponding chunks of eight bytes from the original weight blocks that are interleaved
+         for (int j = 0; j < 4; j++) {
+             // Apply the multiplier
+             __m256 v0 = _mm256_mul_ps(srcv[0][j], idvec[0]);
+             __m256 v1 = _mm256_mul_ps(srcv[1][j], idvec[1]);
+             __m256 v2 = _mm256_mul_ps(srcv[2][j], idvec[2]);
+             __m256 v3 = _mm256_mul_ps(srcv[3][j], idvec[3]);
+
+             // Round to nearest integer
+             v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+             v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+             v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+             v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+             // Convert floats to integers
+             __m256i i0 = _mm256_cvtps_epi32( v0 );
+             __m256i i1 = _mm256_cvtps_epi32( v1 );
+             __m256i i2 = _mm256_cvtps_epi32( v2 );
+             __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+ #if defined(__AVX2__)
+             // Convert int32 to int16
+             i0 = _mm256_packs_epi32( i0, i1 );
+             i2 = _mm256_packs_epi32( i2, i3 );
+             // Convert int16 to int8
+             i0 = _mm256_packs_epi16( i0, i2 );
+
+             // Permute and store the quantized weights in the required order after the pack instruction
+             const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+             i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+             _mm256_storeu_si256((__m256i *)(y[i].qs + 32 * j), i0);
+ #else
+             // Since we don't have in AVX some necessary functions,
+             // we split the registers in half and call AVX2 analogs from SSE
+             __m128i ni0 = _mm256_castsi256_si128( i0 );
+             __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+             __m128i ni2 = _mm256_castsi256_si128( i1 );
+             __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+             __m128i ni4 = _mm256_castsi256_si128( i2 );
+             __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+             __m128i ni6 = _mm256_castsi256_si128( i3 );
+             __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+             // Convert int32 to int16
+             ni0 = _mm_packs_epi32( ni0, ni1 );
+             ni2 = _mm_packs_epi32( ni2, ni3 );
+             ni4 = _mm_packs_epi32( ni4, ni5 );
+             ni6 = _mm_packs_epi32( ni6, ni7 );
+             // Convert int16 to int8
+             ni0 = _mm_packs_epi16( ni0, ni2 );
+             ni4 = _mm_packs_epi16( ni4, ni6 );
+             _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j), ni0);
+             _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j + 16), ni4);
+ #endif
+         }
+     }
  #else
      // scalar
      const int blck_size_interleave = 8;
@@ -335,33 +584,18 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
  }

  size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-     if (!quant_weights) {
-         return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
-     }
-     else {
-         assert(false);
-         return 0;
-     }
+     UNUSED(quant_weights);
+     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
  }

  size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-     if (!quant_weights) {
-         return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
-     }
-     else {
-         assert(false);
-         return 0;
-     }
+     UNUSED(quant_weights);
+     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
  }

  size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-     if (!quant_weights) {
-         return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
-     }
-     else {
-         assert(false);
-         return 0;
-     }
+     UNUSED(quant_weights);
+     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
  }

  void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -383,73 +617,67 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
      UNUSED(ncols_interleaved);
      UNUSED(blocklen);

- #if defined(__ARM_FEATURE_SVE)
-     if (svcntw() == 8) {
-         GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
-                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+     if (ggml_cpu_has_neon()) {
+         const void * b_ptr = vx;
+         const void * a_ptr = vy;
+         float * res_ptr = s;
+
+         __asm__ __volatile__(
+             "movi v31.16b, #0x4\n"
+             "movi v30.16b, #0xf0\n"
+             "add %x[b_ptr], %x[b_ptr], #0x8\n"
+             "1:"  // Column loop
+             "add x22, %x[a_ptr], #0x2\n"
+             "movi v29.16b, #0x0\n"
+             "mov x21, %x[nb]\n"
+             "2:"  // Block loop
+             "ldr q28, [%x[b_ptr], #0x0]\n"
+             "ldr q27, [x22, #0x0]\n"
+             "movi v26.4s, #0x0\n"
+             "sub x20, x22, #0x2\n"
+             "ldr q25, [x22, #0x10]\n"
+             "ldr q24, [%x[b_ptr], #0x10]\n"
+             "sub x21, x21, #0x1\n"
+             "add x22, x22, #0x22\n"
+             "ldr q23, [%x[b_ptr], #0x20]\n"
+             "ldr q22, [%x[b_ptr], #0x30]\n"
+             "ld1r { v21.8h }, [x20]\n"
+             "ldr q20, [%x[b_ptr], #-0x8]\n"
+             "sshl v16.16b, v28.16b, v31.16b\n"
+             "and v28.16b, v28.16b, v30.16b\n"
+             "sshl v19.16b, v24.16b, v31.16b\n"
+             "and v24.16b, v24.16b, v30.16b\n"
+             "add %x[b_ptr], %x[b_ptr], #0x48\n"
+             "sshl v18.16b, v23.16b, v31.16b\n"
+             "and v23.16b, v23.16b, v30.16b\n"
+             ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n"
+             "sshl v17.16b, v22.16b, v31.16b\n"
+             "and v22.16b, v22.16b, v30.16b\n"
+             "fcvtl v21.4s, v21.4h\n"
+             "fcvtl v16.4s, v20.4h\n"
+             ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n"
+             "fmul v16.4s, v16.4s, v21.4s\n"
+             ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n"
+             ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n"
+             ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n"
+             ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n"
+             ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n"
+             ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n"
+             "scvtf v26.4s, v26.4s, #0x4\n"
+             "fmla v29.4s, v26.4s, v16.4s\n"
+             "cbnz x21, 2b\n"
+             "sub %x[nc], %x[nc], #0x4\n"
+             "str q29, [%x[res_ptr], #0x0]\n"
+             "add %x[res_ptr], %x[res_ptr], #0x10\n"
+             "cbnz %x[nc], 1b\n"
+             : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+             : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+             : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
+         );
+         return;
      }
- #endif
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
-                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
- #elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-     const void * b_ptr = vx;
-     const void * a_ptr = vy;
-     float * res_ptr = s;
-
-     __asm__ __volatile__(
-         "movi v31.16b, #0x4\n"
-         "movi v30.16b, #0xf0\n"
-         "add %x[b_ptr], %x[b_ptr], #0x8\n"
-         "1:"  // Column loop
-         "add x22, %x[a_ptr], #0x2\n"
-         "movi v29.16b, #0x0\n"
-         "mov x21, %x[nb]\n"
-         "2:"  // Block loop
-         "ldr q28, [%x[b_ptr], #0x0]\n"
-         "ldr q27, [x22, #0x0]\n"
-         "movi v26.4s, #0x0\n"
-         "sub x20, x22, #0x2\n"
-         "ldr q25, [x22, #0x10]\n"
-         "ldr q24, [%x[b_ptr], #0x10]\n"
-         "sub x21, x21, #0x1\n"
-         "add x22, x22, #0x22\n"
-         "ldr q23, [%x[b_ptr], #0x20]\n"
-         "ldr q22, [%x[b_ptr], #0x30]\n"
-         "ld1r { v21.8h }, [x20]\n"
-         "ldr q20, [%x[b_ptr], #-0x8]\n"
-         "sshl v16.16b, v28.16b, v31.16b\n"
-         "and v28.16b, v28.16b, v30.16b\n"
-         "sshl v19.16b, v24.16b, v31.16b\n"
-         "and v24.16b, v24.16b, v30.16b\n"
-         "add %x[b_ptr], %x[b_ptr], #0x48\n"
-         "sshl v18.16b, v23.16b, v31.16b\n"
-         "and v23.16b, v23.16b, v30.16b\n"
-         ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n"
-         "sshl v17.16b, v22.16b, v31.16b\n"
-         "and v22.16b, v22.16b, v30.16b\n"
-         "fcvtl v21.4s, v21.4h\n"
-         "fcvtl v16.4s, v20.4h\n"
-         ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n"
-         "fmul v16.4s, v16.4s, v21.4s\n"
-         ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n"
-         ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n"
-         ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n"
-         ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n"
-         ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n"
-         ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n"
-         "scvtf v26.4s, v26.4s, #0x4\n"
-         "fmla v29.4s, v26.4s, v16.4s\n"
-         "cbnz x21, 2b\n"
-         "sub %x[nc], %x[nc], #0x4\n"
-         "str q29, [%x[res_ptr], #0x0]\n"
-         "add %x[res_ptr], %x[res_ptr], #0x10\n"
-         "cbnz %x[nc], 1b\n"
-         : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-         : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-         : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
-     );
- #else
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
      float sumf[4];
      int sumi;

@@ -473,7 +701,6 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
          }
          for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
      }
- #endif
  }

  void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -495,79 +722,72 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
      UNUSED(ncols_interleaved);
      UNUSED(blocklen);

- #if defined(__ARM_FEATURE_SVE)
-     if (svcntw() == 8) {
-         GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
-                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+     if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+         const void * b_ptr = vx;
+         const void * a_ptr = vy;
+         float * res_ptr = s;
+
+         __asm__ __volatile__(
+             "movi v2.16b, #0x4\n"
+             "movi v1.16b, #0xf0\n"
+             "add %x[b_ptr], %x[b_ptr], #0x8\n"
+             "1:"  // Column loop
+             "add x23, %x[a_ptr], #0x2\n"
+             "movi v0.16b, #0x0\n"
+             "mov x22, %x[nb]\n"
+             "2:"  // Block loop
+             "ldr q31, [%x[b_ptr], #0x0]\n"
+             "ldr q30, [%x[b_ptr], #0x10]\n"
+             "mov x21, x23\n"
+             "movi v29.4s, #0x0\n"
+             "ldr q28, [%x[b_ptr], #0x20]\n"
+             "ldr q27, [%x[b_ptr], #0x30]\n"
+             "movi v26.4s, #0x0\n"
+             "sub x20, x23, #0x2\n"
+             "ld1r { v25.8h }, [x20]\n"
+             "ldr q24, [%x[b_ptr], #-0x8]\n"
+             "sub x22, x22, #0x1\n"
+             "add x23, x23, #0x22\n"
+             "ld1r { v23.2d }, [x21], #0x8\n"
+             "sshl v22.16b, v31.16b, v2.16b\n"
+             "sshl v16.16b, v30.16b, v2.16b\n"
+             "add %x[b_ptr], %x[b_ptr], #0x48\n"
+             "ld1r { v21.2d }, [x21], #0x8\n"
+             "sshl v20.16b, v28.16b, v2.16b\n"
+             "sshl v19.16b, v27.16b, v2.16b\n"
+             "ld1r { v18.2d }, [x21], #0x8\n"
+             "ld1r { v17.2d }, [x21], #0x8\n"
+             "and v31.16b, v31.16b, v1.16b\n"
+             "and v30.16b, v30.16b, v1.16b\n"
+             ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n"
+             ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n"
+             "and v28.16b, v28.16b, v1.16b\n"
+             "and v27.16b, v27.16b, v1.16b\n"
+             "fcvtl v25.4s, v25.4h\n"
+             "fcvtl v16.4s, v24.4h\n"
+             ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n"
+             ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
+             "fmul v16.4s, v16.4s, v25.4s\n"
+             ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
+             ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
+             ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
+             ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
+             "addp v29.4s, v29.4s, v26.4s\n"
+             "scvtf v29.4s, v29.4s, #0x4\n"
+             "fmla v0.4s, v29.4s, v16.4s\n"
+             "cbnz x22, 2b\n"
+             "sub %x[nc], %x[nc], #0x4\n"
+             "str q0, [%x[res_ptr], #0x0]\n"
+             "add %x[res_ptr], %x[res_ptr], #0x10\n"
+             "cbnz %x[nc], 1b\n"
+             : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+             : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+             : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+         );
+         return;
      }
- #endif
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-     const void * b_ptr = vx;
-     const void * a_ptr = vy;
-     float * res_ptr = s;
-
-     __asm__ __volatile__(
-         "movi v2.16b, #0x4\n"
-         "movi v1.16b, #0xf0\n"
-         "add %x[b_ptr], %x[b_ptr], #0x8\n"
-         "1:"  // Column loop
-         "add x23, %x[a_ptr], #0x2\n"
-         "movi v0.16b, #0x0\n"
-         "mov x22, %x[nb]\n"
-         "2:"  // Block loop
-         "ldr q31, [%x[b_ptr], #0x0]\n"
-         "ldr q30, [%x[b_ptr], #0x10]\n"
-         "mov x21, x23\n"
-         "movi v29.4s, #0x0\n"
-         "ldr q28, [%x[b_ptr], #0x20]\n"
-         "ldr q27, [%x[b_ptr], #0x30]\n"
-         "movi v26.4s, #0x0\n"
-         "sub x20, x23, #0x2\n"
-         "ld1r { v25.8h }, [x20]\n"
-         "ldr q24, [%x[b_ptr], #-0x8]\n"
-         "sub x22, x22, #0x1\n"
-         "add x23, x23, #0x22\n"
-         "ld1r { v23.2d }, [x21], #0x8\n"
-         "sshl v22.16b, v31.16b, v2.16b\n"
-         "sshl v16.16b, v30.16b, v2.16b\n"
-         "add %x[b_ptr], %x[b_ptr], #0x48\n"
-         "ld1r { v21.2d }, [x21], #0x8\n"
-         "sshl v20.16b, v28.16b, v2.16b\n"
-         "sshl v19.16b, v27.16b, v2.16b\n"
-         "ld1r { v18.2d }, [x21], #0x8\n"
-         "ld1r { v17.2d }, [x21], #0x8\n"
-         "and v31.16b, v31.16b, v1.16b\n"
-         "and v30.16b, v30.16b, v1.16b\n"
-         ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n"
-         ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n"
-         "and v28.16b, v28.16b, v1.16b\n"
-         "and v27.16b, v27.16b, v1.16b\n"
-         "fcvtl v25.4s, v25.4h\n"
-         "fcvtl v16.4s, v24.4h\n"
-         ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n"
-         ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
-         "fmul v16.4s, v16.4s, v25.4s\n"
-         ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
-         ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
-         ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
-         ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
-         "addp v29.4s, v29.4s, v26.4s\n"
-         "scvtf v29.4s, v29.4s, #0x4\n"
-         "fmla v0.4s, v29.4s, v16.4s\n"
-         "cbnz x22, 2b\n"
-         "sub %x[nc], %x[nc], #0x4\n"
-         "str q0, [%x[res_ptr], #0x0]\n"
-         "add %x[res_ptr], %x[res_ptr], #0x10\n"
-         "cbnz %x[nc], 1b\n"
-         : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-         : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-         : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-     );
- #elif defined(__ARM_NEON) && defined(__aarch64__)
-     GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                 "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                 "performance");
- #else
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
      float sumf[4];
      int sumi;

@@ -591,7 +811,6 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
          }
          for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
      }
- #endif
  }

  void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -613,8 +832,9 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
      UNUSED(ncols_interleaved);
      UNUSED(blocklen);

- #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-     if (svcntw() == 8) {
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+ #if defined(__ARM_FEATURE_SVE)
+     if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
          const void * b_ptr = vx;
          const void * a_ptr = vy;
          float * res_ptr = s;
@@ -679,49 +899,124 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
          );
          return;
      }
-     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-         GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
-                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
-                     "performance");
-     }
-     else if (ggml_cpu_has_neon()) {
-         GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
-                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
-                     "quantization format for optimal performance");
+ #endif // #if defined(__ARM_FEATURE_SVE)
+ #elif defined(__AVX2__)
+     // Lookup table to convert signed nibbles to signed bytes
+     __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
+     signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
+     __m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
+     __m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+
+     // Permute mask used for easier vector processing at later stages
+     const __m256i m4b = _mm256_set1_epi8(0x0F);
+
+     int64_t b_nb = n / QK4_0;
+
+     const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
+     const block_q8_0 * a_ptr_start = (const block_q8_0 *)vy;
+
+     // Process Q8_0 blocks one by one
+     for (int64_t y = 0; y < nr; y++) {
+
+         // Pointers to LHS blocks of block_q8_0 format
+         const block_q8_0 * a_ptr = a_ptr_start + (y * nb);
+
+         // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
+         for (int64_t x = 0; x < nc / 8; x++) {
+
+             // Pointers to RHS blocks
+             const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+             // Master FP accumulator
+             __m256 acc_row = _mm256_setzero_ps();
+
+             for (int64_t b = 0; b < nb; b++) {
+                 // Load 8 blocks of Q4_0 interleaved as 8 bytes (B0 - B7)
+                 const __m256i rhs_raw_vec_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+                 const __m256i rhs_raw_vec_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 1);
+                 const __m256i rhs_raw_vec_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 2);
+                 const __m256i rhs_raw_vec_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 3);
+
+                 // 4-bit -> 8-bit - Sign is maintained
+                 const __m256i rhs_vec_0123_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_0, m4b)); // B0(0-7) B1(0-7) B2(0-7) B3(0-7)
+                 const __m256i rhs_vec_4567_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_0, m4b)); // B4(0-7) B5(0-7) B6(0-7) B7(0-7)
+                 const __m256i rhs_vec_0123_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
+                 const __m256i rhs_vec_4567_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
+
+                 const __m256i rhs_vec_0123_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 4), m4b)); // B0(16-23) B1(16-23) B2(16-23) B3(16-23)
+                 const __m256i rhs_vec_4567_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 4), m4b)); // B4(16-23) B5(16-23) B6(16-23) B7(16-23)
+                 const __m256i rhs_vec_0123_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 4), m4b)); // B0(24-31) B1(24-31) B2(24-31) B3(24-31)
+                 const __m256i rhs_vec_4567_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 4), m4b)); // B4(24-31) B5(24-31) B6(24-31) B7(24-31)
+
+                 // Load the scale values for the 8 blocks interleaved in block_q4_0x8
+                 const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
+
+                 // Load and convert to FP32 scale from block_q8_0
+                 const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
+
+                 // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
+                 __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
+                 __m256i lhs_vec_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 16)));
+
+                 lhs_vec_0 = _mm256_permute2f128_si256(lhs_vec_0, lhs_vec_0, 0); // A0 (0-15) A0(0-15)
+                 lhs_vec_1 = _mm256_permute2f128_si256(lhs_vec_1, lhs_vec_1, 0); // A0 (16-31) A0(16-31))
+
+                 __m256i iacc = _mm256_setzero_si256();
+
+                 // Dot product done within 32 bit lanes and accumulated in the same vector
+                 // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
+                 // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
+                 // ...........................................................................
+                 // B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)
+
+                 iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
+                 iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));
+
+                 iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
+                 iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));
+
+                 iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
+                 iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));
+
+                 iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
+                 iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));
+
+                 // Accumulated values multipled with appropriate scales
+                 acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
+             }
+
+             // Accumulated output values permuted so as to be stored in appropriate order post accumulation
+             acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask);
+             _mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
+         }
      }
- #endif
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-     GGML_ASSERT(ggml_cpu_has_sve() &&
-                 "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
- #elif defined(__ARM_NEON) && defined(__aarch64__)
-     GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
-                 "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
-                 "performance");
- #else
-     float sumf[8];
-     int sumi;
+     return;
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+     {
+         float sumf[8];
+         int sumi;

-     const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-     for (int x = 0; x < nc / ncols_interleaved; x++) {
-         const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+         const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+         for (int x = 0; x < nc / ncols_interleaved; x++) {
+             const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

-         for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-         for (int l = 0; l < nb; l++) {
-             for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                 for (int j = 0; j < ncols_interleaved; j++) {
-                     sumi = 0;
-                     for (int i = 0; i < blocklen; ++i) {
-                         const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                         const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                         sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+             for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+             for (int l = 0; l < nb; l++) {
+                 for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                     for (int j = 0; j < ncols_interleaved; j++) {
+                         sumi = 0;
+                         for (int i = 0; i < blocklen; ++i) {
+                             const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                             const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                             sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                         }
+                         sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
                      }
-                     sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
                  }
              }
+             for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
          }
-         for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
      }
- #endif
  }

  void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -744,505 +1039,500 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
      UNUSED(ncols_interleaved);
      UNUSED(blocklen);

- #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-     if (svcntw() == 8) {
-         GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
-                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
-     }
- #endif
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
-                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
- #elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-     const void * b_ptr = vx;
-     const void * a_ptr = vy;
-     float * res_ptr = s;
-     size_t res_stride = bs * sizeof(float);
-
-     __asm__ __volatile__(
-         "mov x10, %x[nr]\n"
-         "mov x9, #0x88\n"
-         "cmp x10, #0x10\n"
-         "mul x9, %x[nb], x9\n"
-         "blt 4f\n"
-         "1:"  // Row loop
-         "add x28, %x[b_ptr], #0x8\n"
-         "mov x27, %x[nc]\n"
-         "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
-         "2:"  // Column loop
-         "add x25, %x[a_ptr], #0x8\n"
-         "movi v15.16b, #0x0\n"
-         "movi v19.16b, #0x0\n"
-         "mov x24, %x[nb]\n"
-         "add x23, x25, x9\n"
-         "movi v18.16b, #0x0\n"
-         "movi v14.16b, #0x0\n"
-         "add x22, x23, x9\n"
-         "movi v11.16b, #0x0\n"
-         "movi v13.16b, #0x0\n"
-         "add x21, x22, x9\n"
-         "movi v23.16b, #0x0\n"
-         "movi v16.16b, #0x0\n"
-         "movi v25.16b, #0x0\n"
-         "movi v7.16b, #0x0\n"
-         "movi v0.16b, #0x0\n"
-         "movi v4.16b, #0x0\n"
-         "movi v5.16b, #0x0\n"
-         "movi v21.16b, #0x0\n"
-         "movi v8.16b, #0x0\n"
-         "movi v1.16b, #0x0\n"
-         "3:"  // Block loop
-         "ldr q3, [x28, #0x0]\n"
-         "ldr q31, [x25, #0x0]\n"
-         "movi v28.16b, #0x4\n"
-         "movi v10.4s, #0x0\n"
-         "ldr q22, [x28, #0x10]\n"
-         "ldr q6, [x25, #0x10]\n"
-         "movi v29.4s, #0x0\n"
-         "movi v9.4s, #0x0\n"
-         "ldr q27, [x28, #0x20]\n"
-         "ldr q30, [x28, #0x30]\n"
-         "movi v20.4s, #0x0\n"
-         "movi v24.16b, #0xf0\n"
-         "ldr d2, [x25, #-0x8]\n"
-         "ldr d26, [x23, #-0x8]\n"
-         "sshl v12.16b, v3.16b, v28.16b\n"
-         "sub x20, x28, #0x8\n"
-         "ldr d17, [x20, #0x0]\n"
-         "and v3.16b, v3.16b, v24.16b\n"
-         "subs x24, x24, #0x1\n"
-         "add x28, x28, #0x48\n"
-         ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
-         ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
-         ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
-         ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
-         "sshl v31.16b, v22.16b, v28.16b\n"
-         "and v22.16b, v22.16b, v24.16b\n"
-         "fcvtl v17.4s, v17.4h\n"
-         "fcvtl v2.4s, v2.4h\n"
-         "fcvtl v26.4s, v26.4h\n"
-         ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
-         ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
-         ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
-         ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
-         "sshl v6.16b, v27.16b, v28.16b\n"
-         "sshl v28.16b, v30.16b, v28.16b\n"
-         "and v27.16b, v27.16b, v24.16b\n"
-         "and v30.16b, v30.16b, v24.16b\n"
-         "ldr q24, [x25, #0x20]\n"
-         ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
-         ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
-         ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
-         "ldr q24, [x25, #0x30]\n"
-         ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
-         ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
-         ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
-         "ldr q24, [x25, #0x40]\n"
-         ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
-         ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
-         ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
-         "ldr q24, [x25, #0x50]\n"
-         ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
-         ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
-         ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
-         "ldr q24, [x25, #0x60]\n"
-         ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
-         ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
-         ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
-         "ldr q24, [x25, #0x70]\n"
-         "add x25, x25, #0x88\n"
-         ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
-         ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
-         ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
-         "fmul v24.4s, v17.4s, v2.s[0]\n"
-         "scvtf v10.4s, v10.4s, #0x4\n"
-         "scvtf v29.4s, v29.4s, #0x4\n"
-         "scvtf v9.4s, v9.4s, #0x4\n"
-         "scvtf v20.4s, v20.4s, #0x4\n"
-         "fmla v15.4s, v10.4s, v24.4s\n"
-         "ldr q24, [x23, #0x0]\n"
-         "fmul v10.4s, v17.4s, v2.s[1]\n"
-         "fmla v19.4s, v29.4s, v10.4s\n"
-         "ldr q10, [x23, #0x10]\n"
-         "fmul v29.4s, v17.4s, v2.s[2]\n"
-         "fmul v2.4s, v17.4s, v2.s[3]\n"
-         "fmla v18.4s, v9.4s, v29.4s\n"
-         "movi v9.4s, #0x0\n"
-         "movi v29.4s, #0x0\n"
-         ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
-         "fmla v14.4s, v20.4s, v2.4s\n"
-         "movi v20.4s, #0x0\n"
-         "movi v2.4s, #0x0\n"
-         ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
-         ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
-         "ldr q24, [x23, #0x20]\n"
-         ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
-         ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
-         ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
-         ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
-         "ldr q10, [x23, #0x30]\n"
-         ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
-         ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
-         ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
-         "ldr q24, [x23, #0x40]\n"
-         ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
-         ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
-         ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
-         ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
-         "ldr q10, [x23, #0x50]\n"
-         ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
-         ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
-         ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
-         "ldr q24, [x23, #0x60]\n"
-         ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
-         ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
-         ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
-         ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
-         "ldr q10, [x23, #0x70]\n"
-         "add x23, x23, #0x88\n"
-         ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
-         ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
-         ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
-         "ldr q24, [x22, #0x0]\n"
-         ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
-         ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
-         ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
-         ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
-         "fmul v10.4s, v17.4s, v26.s[0]\n"
-         "scvtf v9.4s, v9.4s, #0x4\n"
-         "scvtf v29.4s, v29.4s, #0x4\n"
-         "scvtf v20.4s, v20.4s, #0x4\n"
-         "scvtf v2.4s, v2.4s, #0x4\n"
-         "fmla v11.4s, v9.4s, v10.4s\n"
-         "ldr q9, [x22, #0x10]\n"
-         "fmul v10.4s, v17.4s, v26.s[1]\n"
-         "fmla v13.4s, v29.4s, v10.4s\n"
-         "ldr d29, [x22, #-0x8]\n"
-         "fmul v10.4s, v17.4s, v26.s[2]\n"
-         "fmul v26.4s, v17.4s, v26.s[3]\n"
-         "fcvtl v29.4s, v29.4h\n"
-         "fmla v23.4s, v20.4s, v10.4s\n"
-         "movi v20.4s, #0x0\n"
-         "movi v10.4s, #0x0\n"
-         "fmla v16.4s, v2.4s, v26.4s\n"
-         "movi v26.4s, #0x0\n"
-         "movi v2.4s, #0x0\n"
-         ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
-         ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
-         ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
-         "ldr q24, [x22, #0x20]\n"
-         ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
-         ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
-         ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
-         ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
-         "ldr q9, [x22, #0x30]\n"
-         ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
-         ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
-         ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
-         "ldr q24, [x22, #0x40]\n"
-         ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
-         ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
-         ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
-         ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
-         "ldr q9, [x22, #0x50]\n"
-         ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
-         ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
-         ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
-         "ldr q24, [x22, #0x60]\n"
-         ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
-         ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
-         ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
-         ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
-         "ldr q9, [x22, #0x70]\n"
-         "add x22, x22, #0x88\n"
-         ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
-         ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
-         ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
-         "ldr q24, [x21, #0x0]\n"
-         ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
-         ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
-         ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
-         ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
-         "fmul v9.4s, v17.4s, v29.s[0]\n"
-         "scvtf v20.4s, v20.4s, #0x4\n"
-         "scvtf v10.4s, v10.4s, #0x4\n"
-         "scvtf v26.4s, v26.4s, #0x4\n"
-         "scvtf v2.4s, v2.4s, #0x4\n"
-         "fmla v25.4s, v20.4s, v9.4s\n"
-         "ldr q9, [x21, #0x10]\n"
-         "fmul v20.4s, v17.4s, v29.s[1]\n"
-         "fmla v7.4s, v10.4s, v20.4s\n"
-         "ldr d20, [x21, #-0x8]\n"
-         "fmul v10.4s, v17.4s, v29.s[2]\n"
-         "fmul v29.4s, v17.4s, v29.s[3]\n"
-         "fcvtl v20.4s, v20.4h\n"
-         "fmla v0.4s, v26.4s, v10.4s\n"
-         "movi v26.4s, #0x0\n"
-         "movi v10.4s, #0x0\n"
-         "fmla v4.4s, v2.4s, v29.4s\n"
-         "movi v2.4s, #0x0\n"
-         "movi v29.4s, #0x0\n"
-         ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
-         ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
-         ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
-         ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
-         "ldr q12, [x21, #0x20]\n"
-         "fmul v24.4s, v17.4s, v20.s[0]\n"
-         ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
-         ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
-         ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
-         ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
-         "ldr q9, [x21, #0x30]\n"
-         "fmul v31.4s, v17.4s, v20.s[1]\n"
-         ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
-         ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
-         ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
-         ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
-         "ldr q12, [x21, #0x40]\n"
-         "fmul v6.4s, v17.4s, v20.s[2]\n"
1017
- "fmul v20.4s, v17.4s, v20.s[3]\n"
1018
- ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
1019
- ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
1020
- ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
1021
- ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
1022
- "ldr q9, [x21, #0x50]\n"
1023
- ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
1024
- ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
1025
- ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
1026
- ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
1027
- "ldr q12, [x21, #0x60]\n"
1028
- ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
1029
- ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
1030
- ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
1031
- ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
1032
- "ldr q17, [x21, #0x70]\n"
1033
- "add x21, x21, #0x88\n"
1034
- ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
1035
- ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
1036
- ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
1037
- ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
1038
- ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
1039
- ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
1040
- ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
1041
- ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
1042
- "scvtf v26.4s, v26.4s, #0x4\n"
1043
- "scvtf v10.4s, v10.4s, #0x4\n"
1044
- "fmla v5.4s, v26.4s, v24.4s\n"
1045
- "scvtf v2.4s, v2.4s, #0x4\n"
1046
- "scvtf v29.4s, v29.4s, #0x4\n"
1047
- "fmla v21.4s, v10.4s, v31.4s\n"
1048
- "fmla v8.4s, v2.4s, v6.4s\n"
1049
- "fmla v1.4s, v29.4s, v20.4s\n"
1050
- "bgt 3b\n"
1051
- "mov x20, %x[res_ptr]\n"
1052
- "subs x27, x27, #0x4\n"
1053
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
1054
- "str q15, [x20, #0x0]\n"
1055
- "add x20, x20, %x[res_stride]\n"
1056
- "str q19, [x20, #0x0]\n"
1057
- "add x20, x20, %x[res_stride]\n"
1058
- "str q18, [x20, #0x0]\n"
1059
- "add x20, x20, %x[res_stride]\n"
1060
- "str q14, [x20, #0x0]\n"
1061
- "add x20, x20, %x[res_stride]\n"
1062
- "str q11, [x20, #0x0]\n"
1063
- "add x20, x20, %x[res_stride]\n"
1064
- "str q13, [x20, #0x0]\n"
1065
- "add x20, x20, %x[res_stride]\n"
1066
- "str q23, [x20, #0x0]\n"
1067
- "add x20, x20, %x[res_stride]\n"
1068
- "str q16, [x20, #0x0]\n"
1069
- "add x20, x20, %x[res_stride]\n"
1070
- "str q25, [x20, #0x0]\n"
1071
- "add x20, x20, %x[res_stride]\n"
1072
- "str q7, [x20, #0x0]\n"
1073
- "add x20, x20, %x[res_stride]\n"
1074
- "str q0, [x20, #0x0]\n"
1075
- "add x20, x20, %x[res_stride]\n"
1076
- "str q4, [x20, #0x0]\n"
1077
- "add x20, x20, %x[res_stride]\n"
1078
- "str q5, [x20, #0x0]\n"
1079
- "add x20, x20, %x[res_stride]\n"
1080
- "str q21, [x20, #0x0]\n"
1081
- "add x20, x20, %x[res_stride]\n"
1082
- "str q8, [x20, #0x0]\n"
1083
- "add x20, x20, %x[res_stride]\n"
1084
- "str q1, [x20, #0x0]\n"
1085
- "bne 2b\n"
1086
- "mov x20, #0x4\n"
1087
- "sub x10, x10, #0x10\n"
1088
- "cmp x10, #0x10\n"
1089
- "mov %x[res_ptr], x26\n"
1090
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
1091
- "bge 1b\n"
1092
- "4:" // Row loop skip
1093
- "cbz x10, 9f\n"
1094
- "5:" // Row tail: Row loop
1095
- "add x24, %x[b_ptr], #0x8\n"
1096
- "mov x23, %x[nc]\n"
1097
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
1098
- "6:" // Row tail: Column loop
1099
- "movi v15.16b, #0x0\n"
1100
- "movi v19.16b, #0x0\n"
1101
- "add x25, %x[a_ptr], #0x8\n"
1102
- "mov x21, %x[nb]\n"
1103
- "movi v18.16b, #0x0\n"
1104
- "movi v14.16b, #0x0\n"
1105
- "7:" // Row tail: Block loop
1106
- "ldr q7, [x24, #0x0]\n"
1107
- "ldr q5, [x25, #0x0]\n"
1108
- "movi v9.16b, #0x4\n"
1109
- "movi v4.4s, #0x0\n"
1110
- "ldr q3, [x24, #0x10]\n"
1111
- "ldr q2, [x25, #0x10]\n"
1112
- "movi v1.4s, #0x0\n"
1113
- "movi v0.4s, #0x0\n"
1114
- "ldr q13, [x24, #0x20]\n"
1115
- "ldr q31, [x25, #0x20]\n"
1116
- "movi v30.4s, #0x0\n"
1117
- "movi v29.16b, #0xf0\n"
1118
- "ldr q28, [x24, #0x30]\n"
1119
- "ldr q27, [x25, #0x30]\n"
1120
- "sshl v20.16b, v7.16b, v9.16b\n"
1121
- "sub x20, x24, #0x8\n"
1122
- "ldr q26, [x25, #0x40]\n"
1123
- "ldr q25, [x25, #0x50]\n"
1124
- "sshl v17.16b, v3.16b, v9.16b\n"
1125
- "and v7.16b, v7.16b, v29.16b\n"
1126
- "ldr q24, [x25, #0x60]\n"
1127
- "ldr q16, [x25, #0x70]\n"
1128
- "sshl v22.16b, v13.16b, v9.16b\n"
1129
- "and v3.16b, v3.16b, v29.16b\n"
1130
- "ldr d21, [x20, #0x0]\n"
1131
- "ldr d12, [x25, #-0x8]\n"
1132
- ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
1133
- ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
1134
- ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
1135
- ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
1136
- "sshl v9.16b, v28.16b, v9.16b\n"
1137
- "subs x21, x21, #0x1\n"
1138
- "and v13.16b, v13.16b, v29.16b\n"
1139
- "and v28.16b, v28.16b, v29.16b\n"
1140
- "add x25, x25, #0x88\n"
1141
- "add x24, x24, #0x48\n"
1142
- "fcvtl v21.4s, v21.4h\n"
1143
- "fcvtl v12.4s, v12.4h\n"
1144
- ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
1145
- ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
1146
- ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
1147
- ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
1148
- "fmul v11.4s, v21.4s, v12.s[0]\n"
1149
- "fmul v23.4s, v21.4s, v12.s[1]\n"
1150
- "fmul v17.4s, v21.4s, v12.s[2]\n"
1151
- ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
1152
- "fmul v6.4s, v21.4s, v12.s[3]\n"
1153
- ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
1154
- ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
1155
- ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
1156
- ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
1157
- ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
1158
- ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
1159
- ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
1160
- ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
1161
- ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
1162
- ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
1163
- ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
1164
- ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
1165
- ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
1166
- ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
1167
- ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
1168
- ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
1169
- ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
1170
- ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
1171
- ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
1172
- ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
1173
- ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
1174
- ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
1175
- ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
1176
- "scvtf v4.4s, v4.4s, #0x4\n"
1177
- "scvtf v1.4s, v1.4s, #0x4\n"
1178
- "scvtf v0.4s, v0.4s, #0x4\n"
1179
- "fmla v15.4s, v4.4s, v11.4s\n"
1180
- "scvtf v30.4s, v30.4s, #0x4\n"
1181
- "fmla v19.4s, v1.4s, v23.4s\n"
1182
- "fmla v18.4s, v0.4s, v17.4s\n"
1183
- "fmla v14.4s, v30.4s, v6.4s\n"
1184
- "bgt 7b\n"
1185
- "mov x20, %x[res_ptr]\n"
1186
- "cmp x10, #0x1\n"
1187
- "str q15, [x20, #0x0]\n"
1188
- "add x20, x20, %x[res_stride]\n"
1189
- "ble 8f\n"
1190
- "cmp x10, #0x2\n"
1191
- "str q19, [x20, #0x0]\n"
1192
- "add x20, x20, %x[res_stride]\n"
1193
- "ble 8f\n"
1194
- "cmp x10, #0x3\n"
1195
- "str q18, [x20, #0x0]\n"
1196
- "add x20, x20, %x[res_stride]\n"
1197
- "ble 8f\n"
1198
- "str q14, [x20, #0x0]\n"
1199
- "8:" // Row tail: Accumulator store skip
1200
- "subs x23, x23, #0x4\n"
1201
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
1202
- "bne 6b\n"
1203
- "subs x10, x10, #0x4\n"
1204
- "add %x[a_ptr], %x[a_ptr], x9\n"
1205
- "mov %x[res_ptr], x22\n"
1206
- "bgt 5b\n"
1207
- "9:" // Row tail: Row loop skip
1208
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1209
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1210
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1211
- );
1212
- #else
1213
- float sumf[4][4];
1214
- int sumi;
1042
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
1043
+ if (ggml_cpu_has_neon()) {
1044
+ const void * b_ptr = vx;
1045
+ const void * a_ptr = vy;
1046
+ float * res_ptr = s;
1047
+ size_t res_stride = bs * sizeof(float);
1215
1048
 
1216
- for (int y = 0; y < nr / 4; y++) {
1217
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1218
- for (int x = 0; x < nc / ncols_interleaved; x++) {
1219
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
1220
- for (int m = 0; m < 4; m++) {
1221
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1222
- }
1223
- for (int l = 0; l < nb; l++) {
1224
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1225
- for (int m = 0; m < 4; m++) {
1226
- for (int j = 0; j < ncols_interleaved; j++) {
1227
- sumi = 0;
1228
- for (int i = 0; i < blocklen; ++i) {
1229
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1230
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1231
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1232
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1049
+ __asm__ __volatile__(
1050
+ "mov x10, %x[nr]\n"
1051
+ "mov x9, #0x88\n"
1052
+ "cmp x10, #0x10\n"
1053
+ "mul x9, %x[nb], x9\n"
1054
+ "blt 4f\n"
1055
+ "1:" // Row loop
1056
+ "add x28, %x[b_ptr], #0x8\n"
1057
+ "mov x27, %x[nc]\n"
1058
+ "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
1059
+ "2:" // Column loop
1060
+ "add x25, %x[a_ptr], #0x8\n"
1061
+ "movi v15.16b, #0x0\n"
1062
+ "movi v19.16b, #0x0\n"
1063
+ "mov x24, %x[nb]\n"
1064
+ "add x23, x25, x9\n"
1065
+ "movi v18.16b, #0x0\n"
1066
+ "movi v14.16b, #0x0\n"
1067
+ "add x22, x23, x9\n"
1068
+ "movi v11.16b, #0x0\n"
1069
+ "movi v13.16b, #0x0\n"
1070
+ "add x21, x22, x9\n"
1071
+ "movi v23.16b, #0x0\n"
1072
+ "movi v16.16b, #0x0\n"
1073
+ "movi v25.16b, #0x0\n"
1074
+ "movi v7.16b, #0x0\n"
1075
+ "movi v0.16b, #0x0\n"
1076
+ "movi v4.16b, #0x0\n"
1077
+ "movi v5.16b, #0x0\n"
1078
+ "movi v21.16b, #0x0\n"
1079
+ "movi v8.16b, #0x0\n"
1080
+ "movi v1.16b, #0x0\n"
1081
+ "3:" // Block loop
1082
+ "ldr q3, [x28, #0x0]\n"
1083
+ "ldr q31, [x25, #0x0]\n"
1084
+ "movi v28.16b, #0x4\n"
1085
+ "movi v10.4s, #0x0\n"
1086
+ "ldr q22, [x28, #0x10]\n"
1087
+ "ldr q6, [x25, #0x10]\n"
1088
+ "movi v29.4s, #0x0\n"
1089
+ "movi v9.4s, #0x0\n"
1090
+ "ldr q27, [x28, #0x20]\n"
1091
+ "ldr q30, [x28, #0x30]\n"
1092
+ "movi v20.4s, #0x0\n"
1093
+ "movi v24.16b, #0xf0\n"
1094
+ "ldr d2, [x25, #-0x8]\n"
1095
+ "ldr d26, [x23, #-0x8]\n"
1096
+ "sshl v12.16b, v3.16b, v28.16b\n"
1097
+ "sub x20, x28, #0x8\n"
1098
+ "ldr d17, [x20, #0x0]\n"
1099
+ "and v3.16b, v3.16b, v24.16b\n"
1100
+ "subs x24, x24, #0x1\n"
1101
+ "add x28, x28, #0x48\n"
1102
+ ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
1103
+ ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
1104
+ ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
1105
+ ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
1106
+ "sshl v31.16b, v22.16b, v28.16b\n"
1107
+ "and v22.16b, v22.16b, v24.16b\n"
1108
+ "fcvtl v17.4s, v17.4h\n"
1109
+ "fcvtl v2.4s, v2.4h\n"
1110
+ "fcvtl v26.4s, v26.4h\n"
1111
+ ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
1112
+ ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
1113
+ ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
1114
+ ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
1115
+ "sshl v6.16b, v27.16b, v28.16b\n"
1116
+ "sshl v28.16b, v30.16b, v28.16b\n"
1117
+ "and v27.16b, v27.16b, v24.16b\n"
1118
+ "and v30.16b, v30.16b, v24.16b\n"
1119
+ "ldr q24, [x25, #0x20]\n"
1120
+ ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
1121
+ ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
1122
+ ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
1123
+ ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
1124
+ "ldr q24, [x25, #0x30]\n"
1125
+ ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
1126
+ ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
1127
+ ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
1128
+ ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
1129
+ "ldr q24, [x25, #0x40]\n"
1130
+ ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
1131
+ ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
1132
+ ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
1133
+ ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
1134
+ "ldr q24, [x25, #0x50]\n"
1135
+ ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
1136
+ ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
1137
+ ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
1138
+ ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
1139
+ "ldr q24, [x25, #0x60]\n"
1140
+ ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
1141
+ ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
1142
+ ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
1143
+ ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
1144
+ "ldr q24, [x25, #0x70]\n"
1145
+ "add x25, x25, #0x88\n"
1146
+ ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
1147
+ ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
1148
+ ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
1149
+ ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
1150
+ "fmul v24.4s, v17.4s, v2.s[0]\n"
1151
+ "scvtf v10.4s, v10.4s, #0x4\n"
1152
+ "scvtf v29.4s, v29.4s, #0x4\n"
1153
+ "scvtf v9.4s, v9.4s, #0x4\n"
1154
+ "scvtf v20.4s, v20.4s, #0x4\n"
1155
+ "fmla v15.4s, v10.4s, v24.4s\n"
1156
+ "ldr q24, [x23, #0x0]\n"
1157
+ "fmul v10.4s, v17.4s, v2.s[1]\n"
1158
+ "fmla v19.4s, v29.4s, v10.4s\n"
1159
+ "ldr q10, [x23, #0x10]\n"
1160
+ "fmul v29.4s, v17.4s, v2.s[2]\n"
1161
+ "fmul v2.4s, v17.4s, v2.s[3]\n"
1162
+ "fmla v18.4s, v9.4s, v29.4s\n"
1163
+ "movi v9.4s, #0x0\n"
1164
+ "movi v29.4s, #0x0\n"
1165
+ ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
1166
+ ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
1167
+ "fmla v14.4s, v20.4s, v2.4s\n"
1168
+ "movi v20.4s, #0x0\n"
1169
+ "movi v2.4s, #0x0\n"
1170
+ ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
1171
+ ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
1172
+ "ldr q24, [x23, #0x20]\n"
1173
+ ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
1174
+ ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
1175
+ ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
1176
+ ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
1177
+ "ldr q10, [x23, #0x30]\n"
1178
+ ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
1179
+ ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
1180
+ ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
1181
+ ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
1182
+ "ldr q24, [x23, #0x40]\n"
1183
+ ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
1184
+ ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
1185
+ ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
1186
+ ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
1187
+ "ldr q10, [x23, #0x50]\n"
1188
+ ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
1189
+ ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
1190
+ ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
1191
+ ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
1192
+ "ldr q24, [x23, #0x60]\n"
1193
+ ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
1194
+ ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
1195
+ ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
1196
+ ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
1197
+ "ldr q10, [x23, #0x70]\n"
1198
+ "add x23, x23, #0x88\n"
1199
+ ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
1200
+ ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
1201
+ ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
1202
+ ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
1203
+ "ldr q24, [x22, #0x0]\n"
1204
+ ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
1205
+ ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
1206
+ ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
1207
+ ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
1208
+ "fmul v10.4s, v17.4s, v26.s[0]\n"
1209
+ "scvtf v9.4s, v9.4s, #0x4\n"
1210
+ "scvtf v29.4s, v29.4s, #0x4\n"
1211
+ "scvtf v20.4s, v20.4s, #0x4\n"
1212
+ "scvtf v2.4s, v2.4s, #0x4\n"
1213
+ "fmla v11.4s, v9.4s, v10.4s\n"
1214
+ "ldr q9, [x22, #0x10]\n"
1215
+ "fmul v10.4s, v17.4s, v26.s[1]\n"
1216
+ "fmla v13.4s, v29.4s, v10.4s\n"
1217
+ "ldr d29, [x22, #-0x8]\n"
1218
+ "fmul v10.4s, v17.4s, v26.s[2]\n"
1219
+ "fmul v26.4s, v17.4s, v26.s[3]\n"
1220
+ "fcvtl v29.4s, v29.4h\n"
1221
+ "fmla v23.4s, v20.4s, v10.4s\n"
1222
+ "movi v20.4s, #0x0\n"
1223
+ "movi v10.4s, #0x0\n"
1224
+ "fmla v16.4s, v2.4s, v26.4s\n"
1225
+ "movi v26.4s, #0x0\n"
1226
+ "movi v2.4s, #0x0\n"
1227
+ ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
1228
+ ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
1229
+ ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
1230
+ ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
1231
+ "ldr q24, [x22, #0x20]\n"
1232
+ ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
1233
+ ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
1234
+ ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
1235
+ ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
1236
+ "ldr q9, [x22, #0x30]\n"
1237
+ ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
1238
+ ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
1239
+ ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
1240
+ ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
1241
+ "ldr q24, [x22, #0x40]\n"
1242
+ ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
1243
+ ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
1244
+ ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
1245
+ ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
1246
+ "ldr q9, [x22, #0x50]\n"
1247
+ ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
1248
+ ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
1249
+ ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
1250
+ ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
1251
+ "ldr q24, [x22, #0x60]\n"
1252
+ ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
1253
+ ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
1254
+ ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
1255
+ ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
1256
+ "ldr q9, [x22, #0x70]\n"
1257
+ "add x22, x22, #0x88\n"
1258
+ ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
1259
+ ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
1260
+ ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
1261
+ ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
1262
+ "ldr q24, [x21, #0x0]\n"
1263
+ ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
1264
+ ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
1265
+ ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
1266
+ ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
1267
+ "fmul v9.4s, v17.4s, v29.s[0]\n"
1268
+ "scvtf v20.4s, v20.4s, #0x4\n"
1269
+ "scvtf v10.4s, v10.4s, #0x4\n"
1270
+ "scvtf v26.4s, v26.4s, #0x4\n"
1271
+ "scvtf v2.4s, v2.4s, #0x4\n"
1272
+ "fmla v25.4s, v20.4s, v9.4s\n"
1273
+ "ldr q9, [x21, #0x10]\n"
1274
+ "fmul v20.4s, v17.4s, v29.s[1]\n"
1275
+ "fmla v7.4s, v10.4s, v20.4s\n"
1276
+ "ldr d20, [x21, #-0x8]\n"
1277
+ "fmul v10.4s, v17.4s, v29.s[2]\n"
1278
+ "fmul v29.4s, v17.4s, v29.s[3]\n"
1279
+ "fcvtl v20.4s, v20.4h\n"
1280
+ "fmla v0.4s, v26.4s, v10.4s\n"
1281
+ "movi v26.4s, #0x0\n"
1282
+ "movi v10.4s, #0x0\n"
1283
+ "fmla v4.4s, v2.4s, v29.4s\n"
1284
+ "movi v2.4s, #0x0\n"
1285
+ "movi v29.4s, #0x0\n"
1286
+ ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
1287
+ ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
1288
+ ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
1289
+ ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
1290
+ "ldr q12, [x21, #0x20]\n"
1291
+ "fmul v24.4s, v17.4s, v20.s[0]\n"
1292
+ ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
1293
+ ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
1294
+ ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
1295
+ ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
1296
+ "ldr q9, [x21, #0x30]\n"
1297
+ "fmul v31.4s, v17.4s, v20.s[1]\n"
1298
+ ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
1299
+ ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
1300
+ ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
1301
+ ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
1302
+ "ldr q12, [x21, #0x40]\n"
1303
+ "fmul v6.4s, v17.4s, v20.s[2]\n"
1304
+ "fmul v20.4s, v17.4s, v20.s[3]\n"
1305
+ ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
1306
+ ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
1307
+ ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
1308
+ ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
1309
+ "ldr q9, [x21, #0x50]\n"
1310
+ ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
1311
+ ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
1312
+ ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
1313
+ ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
1314
+ "ldr q12, [x21, #0x60]\n"
1315
+ ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
1316
+ ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
1317
+ ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
1318
+ ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
1319
+ "ldr q17, [x21, #0x70]\n"
1320
+ "add x21, x21, #0x88\n"
1321
+ ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
1322
+ ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
1323
+ ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
1324
+ ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
1325
+ ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
1326
+ ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
1327
+ ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
1328
+ ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
1329
+ "scvtf v26.4s, v26.4s, #0x4\n"
1330
+ "scvtf v10.4s, v10.4s, #0x4\n"
1331
+ "fmla v5.4s, v26.4s, v24.4s\n"
1332
+ "scvtf v2.4s, v2.4s, #0x4\n"
1333
+ "scvtf v29.4s, v29.4s, #0x4\n"
1334
+ "fmla v21.4s, v10.4s, v31.4s\n"
1335
+ "fmla v8.4s, v2.4s, v6.4s\n"
1336
+ "fmla v1.4s, v29.4s, v20.4s\n"
1337
+ "bgt 3b\n"
1338
+ "mov x20, %x[res_ptr]\n"
1339
+ "subs x27, x27, #0x4\n"
1340
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1341
+ "str q15, [x20, #0x0]\n"
1342
+ "add x20, x20, %x[res_stride]\n"
1343
+ "str q19, [x20, #0x0]\n"
1344
+ "add x20, x20, %x[res_stride]\n"
1345
+ "str q18, [x20, #0x0]\n"
1346
+ "add x20, x20, %x[res_stride]\n"
1347
+ "str q14, [x20, #0x0]\n"
1348
+ "add x20, x20, %x[res_stride]\n"
1349
+ "str q11, [x20, #0x0]\n"
1350
+ "add x20, x20, %x[res_stride]\n"
1351
+ "str q13, [x20, #0x0]\n"
1352
+ "add x20, x20, %x[res_stride]\n"
1353
+ "str q23, [x20, #0x0]\n"
1354
+ "add x20, x20, %x[res_stride]\n"
1355
+ "str q16, [x20, #0x0]\n"
1356
+ "add x20, x20, %x[res_stride]\n"
1357
+ "str q25, [x20, #0x0]\n"
1358
+ "add x20, x20, %x[res_stride]\n"
1359
+ "str q7, [x20, #0x0]\n"
1360
+ "add x20, x20, %x[res_stride]\n"
1361
+ "str q0, [x20, #0x0]\n"
1362
+ "add x20, x20, %x[res_stride]\n"
1363
+ "str q4, [x20, #0x0]\n"
1364
+ "add x20, x20, %x[res_stride]\n"
1365
+ "str q5, [x20, #0x0]\n"
1366
+ "add x20, x20, %x[res_stride]\n"
1367
+ "str q21, [x20, #0x0]\n"
1368
+ "add x20, x20, %x[res_stride]\n"
1369
+ "str q8, [x20, #0x0]\n"
1370
+ "add x20, x20, %x[res_stride]\n"
1371
+ "str q1, [x20, #0x0]\n"
1372
+ "bne 2b\n"
1373
+ "mov x20, #0x4\n"
1374
+ "sub x10, x10, #0x10\n"
1375
+ "cmp x10, #0x10\n"
1376
+ "mov %x[res_ptr], x26\n"
1377
+ "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
1378
+ "bge 1b\n"
1379
+ "4:" // Row loop skip
1380
+ "cbz x10, 9f\n"
1381
+ "5:" // Row tail: Row loop
1382
+ "add x24, %x[b_ptr], #0x8\n"
1383
+ "mov x23, %x[nc]\n"
1384
+ "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
1385
+ "6:" // Row tail: Column loop
1386
+ "movi v15.16b, #0x0\n"
1387
+ "movi v19.16b, #0x0\n"
1388
+ "add x25, %x[a_ptr], #0x8\n"
1389
+ "mov x21, %x[nb]\n"
1390
+ "movi v18.16b, #0x0\n"
1391
+ "movi v14.16b, #0x0\n"
1392
+ "7:" // Row tail: Block loop
1393
+ "ldr q7, [x24, #0x0]\n"
1394
+ "ldr q5, [x25, #0x0]\n"
1395
+ "movi v9.16b, #0x4\n"
1396
+ "movi v4.4s, #0x0\n"
1397
+ "ldr q3, [x24, #0x10]\n"
1398
+ "ldr q2, [x25, #0x10]\n"
1399
+ "movi v1.4s, #0x0\n"
1400
+ "movi v0.4s, #0x0\n"
1401
+ "ldr q13, [x24, #0x20]\n"
1402
+ "ldr q31, [x25, #0x20]\n"
1403
+ "movi v30.4s, #0x0\n"
1404
+ "movi v29.16b, #0xf0\n"
1405
+ "ldr q28, [x24, #0x30]\n"
1406
+ "ldr q27, [x25, #0x30]\n"
1407
+ "sshl v20.16b, v7.16b, v9.16b\n"
1408
+ "sub x20, x24, #0x8\n"
1409
+ "ldr q26, [x25, #0x40]\n"
1410
+ "ldr q25, [x25, #0x50]\n"
1411
+ "sshl v17.16b, v3.16b, v9.16b\n"
1412
+ "and v7.16b, v7.16b, v29.16b\n"
1413
+ "ldr q24, [x25, #0x60]\n"
1414
+ "ldr q16, [x25, #0x70]\n"
1415
+ "sshl v22.16b, v13.16b, v9.16b\n"
1416
+ "and v3.16b, v3.16b, v29.16b\n"
1417
+ "ldr d21, [x20, #0x0]\n"
1418
+ "ldr d12, [x25, #-0x8]\n"
1419
+ ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
1420
+ ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
1421
+ ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
1422
+ ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
1423
+ "sshl v9.16b, v28.16b, v9.16b\n"
1424
+ "subs x21, x21, #0x1\n"
1425
+ "and v13.16b, v13.16b, v29.16b\n"
1426
+ "and v28.16b, v28.16b, v29.16b\n"
1427
+ "add x25, x25, #0x88\n"
1428
+ "add x24, x24, #0x48\n"
1429
+ "fcvtl v21.4s, v21.4h\n"
1430
+ "fcvtl v12.4s, v12.4h\n"
1431
+ ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
1432
+ ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
1433
+ ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
1434
+ ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
1435
+ "fmul v11.4s, v21.4s, v12.s[0]\n"
1436
+ "fmul v23.4s, v21.4s, v12.s[1]\n"
1437
+ "fmul v17.4s, v21.4s, v12.s[2]\n"
1438
+ ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
1439
+ "fmul v6.4s, v21.4s, v12.s[3]\n"
1440
+ ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
1441
+ ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
1442
+ ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
1443
+ ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
1444
+ ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
1445
+ ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
1446
+ ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
1447
+ ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
1448
+ ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
1449
+ ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
1450
+ ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
1451
+ ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
1452
+ ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
1453
+ ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
1454
+ ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
1455
+ ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
1456
+ ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
1457
+ ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
1458
+ ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
1459
+ ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
1460
+ ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
1461
+ ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
1462
+ ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
1463
+ "scvtf v4.4s, v4.4s, #0x4\n"
1464
+ "scvtf v1.4s, v1.4s, #0x4\n"
1465
+ "scvtf v0.4s, v0.4s, #0x4\n"
1466
+ "fmla v15.4s, v4.4s, v11.4s\n"
1467
+ "scvtf v30.4s, v30.4s, #0x4\n"
1468
+ "fmla v19.4s, v1.4s, v23.4s\n"
1469
+ "fmla v18.4s, v0.4s, v17.4s\n"
1470
+ "fmla v14.4s, v30.4s, v6.4s\n"
1471
+ "bgt 7b\n"
1472
+ "mov x20, %x[res_ptr]\n"
1473
+ "cmp x10, #0x1\n"
1474
+ "str q15, [x20, #0x0]\n"
1475
+ "add x20, x20, %x[res_stride]\n"
1476
+ "ble 8f\n"
1477
+ "cmp x10, #0x2\n"
1478
+ "str q19, [x20, #0x0]\n"
1479
+ "add x20, x20, %x[res_stride]\n"
1480
+ "ble 8f\n"
1481
+ "cmp x10, #0x3\n"
1482
+ "str q18, [x20, #0x0]\n"
1483
+ "add x20, x20, %x[res_stride]\n"
1484
+ "ble 8f\n"
1485
+ "str q14, [x20, #0x0]\n"
1486
+ "8:" // Row tail: Accumulator store skip
1487
+ "subs x23, x23, #0x4\n"
1488
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1489
+ "bne 6b\n"
1490
+ "subs x10, x10, #0x4\n"
1491
+ "add %x[a_ptr], %x[a_ptr], x9\n"
1492
+ "mov %x[res_ptr], x22\n"
1493
+ "bgt 5b\n"
1494
+ "9:" // Row tail: Row loop skip
1495
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1496
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1497
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1498
+ );
1499
+ return;
1500
+ }
1501
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
1502
+ {
1503
+ float sumf[4][4];
1504
+ int sumi;
1505
+
1506
+ for (int y = 0; y < nr / 4; y++) {
1507
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1508
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1509
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
1510
+ for (int m = 0; m < 4; m++) {
1511
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1512
+ }
1513
+ for (int l = 0; l < nb; l++) {
1514
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1515
+ for (int m = 0; m < 4; m++) {
1516
+ for (int j = 0; j < ncols_interleaved; j++) {
1517
+ sumi = 0;
1518
+ for (int i = 0; i < blocklen; ++i) {
1519
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1520
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1521
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1522
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1523
+ }
1524
+ sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1233
1525
  }
1234
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1235
1526
  }
1236
1527
  }
1237
1528
  }
1238
- }
1239
- for (int m = 0; m < 4; m++) {
1240
- for (int j = 0; j < ncols_interleaved; j++)
1241
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1529
+ for (int m = 0; m < 4; m++) {
1530
+ for (int j = 0; j < ncols_interleaved; j++)
1531
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1532
+ }
1242
1533
  }
1243
1534
  }
1244
1535
  }
1245
- #endif
1246
1536
  }
1247
1537
 
1248
1538
  void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
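Note on the scalar fallback added above: the nibble unpacking looks odd at first glance, but the arithmetic is exact. `(int8_t)(qs << 4)` re-interprets the low nibble of a packed q4_0 byte as a signed 4-bit value scaled by 16, and `(int8_t)(qs & 0xF0)` does the same for the high nibble; the trailing `>> 4` in the accumulation cancels that common factor of 16 before the FP16 deltas are applied. A standalone toy check of just that bit trick (plain C, independent of ggml's types):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* One packed q4_0 byte: high nibble 0xA, low nibble 0x3. */
        uint8_t qs = 0xA3;
        int v0 = (int8_t) (qs << 4);   /* low  nibble, signed, x16 ->  48 */
        int v1 = (int8_t) (qs & 0xF0); /* high nibble, signed, x16 -> -96 */
        /* The kernel's >> 4 removes the x16 factor exactly. */
        printf("%d %d -> %d %d\n", v0, v1, v0 >> 4, v1 >> 4); /* 48 -96 -> 3 -6 */
        return 0;
    }

The assembly path gets the same division by 16 for free: `scvtf vN.4s, vN.4s, #0x4` converts the int32 dot products to float with 4 fractional bits, i.e. it multiplies by 1/16 during the conversion.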
@@ -1265,413 +1555,406 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
1265
1555
  UNUSED(ncols_interleaved);
1266
1556
  UNUSED(blocklen);
1267
1557
 
1268
- #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1269
- if (svcntw() == 8) {
1270
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
1271
- "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
1558
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
1559
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
1560
+ const void * b_ptr = vx;
1561
+ const void * a_ptr = vy;
1562
+ float * res_ptr = s;
1563
+ size_t res_stride = bs * sizeof(float);
1564
+
1565
+ __asm__ __volatile__(
1566
+ "mov x10, %x[nr]\n"
1567
+ "mov x9, #0x88\n"
1568
+ "cmp x10, #0x10\n"
1569
+ "mul x9, %x[nb], x9\n"
1570
+ "blt 4f\n"
1571
+ "1:" // Row loop
1572
+ "add x28, %x[b_ptr], #0x8\n"
1573
+ "mov x27, %x[nc]\n"
1574
+ "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
1575
+ "2:" // Column loop
1576
+ "add x25, %x[a_ptr], #0x8\n"
1577
+ "movi v2.16b, #0x0\n"
1578
+ "movi v10.16b, #0x0\n"
1579
+ "mov x24, %x[nb]\n"
1580
+ "add x23, x25, x9\n"
1581
+ "movi v12.16b, #0x0\n"
1582
+ "movi v28.16b, #0x0\n"
1583
+ "add x22, x23, x9\n"
1584
+ "movi v11.16b, #0x0\n"
1585
+ "movi v13.16b, #0x0\n"
1586
+ "add x21, x22, x9\n"
1587
+ "movi v22.16b, #0x0\n"
1588
+ "movi v23.16b, #0x0\n"
1589
+ "movi v25.16b, #0x0\n"
1590
+ "movi v5.16b, #0x0\n"
1591
+ "movi v7.16b, #0x0\n"
1592
+ "movi v4.16b, #0x0\n"
1593
+ "movi v6.16b, #0x0\n"
1594
+ "movi v30.16b, #0x0\n"
1595
+ "movi v24.16b, #0x0\n"
1596
+ "movi v14.16b, #0x0\n"
1597
+ "3:" // Block loop
1598
+ "ldr q21, [x28, #0x0]\n"
1599
+ "ldr q16, [x28, #0x10]\n"
1600
+ "movi v1.16b, #0x4\n"
1601
+ "movi v19.4s, #0x0\n"
1602
+ "ldr q27, [x25, #0x0]\n"
1603
+ "ldr q15, [x25, #0x10]\n"
1604
+ "movi v26.4s, #0x0\n"
1605
+ "movi v18.4s, #0x0\n"
1606
+ "ldr q29, [x28, #0x20]\n"
1607
+ "ldr q3, [x28, #0x30]\n"
1608
+ "movi v17.4s, #0x0\n"
1609
+ "movi v0.16b, #0xf0\n"
1610
+ "ldr d20, [x25, #-0x8]\n"
1611
+ "ldr d9, [x23, #-0x8]\n"
1612
+ "sshl v8.16b, v21.16b, v1.16b\n"
1613
+ "sshl v31.16b, v16.16b, v1.16b\n"
1614
+ "and v21.16b, v21.16b, v0.16b\n"
1615
+ "and v16.16b, v16.16b, v0.16b\n"
1616
+ "sub x20, x28, #0x8\n"
1617
+ "subs x24, x24, #0x1\n"
1618
+ "add x28, x28, #0x48\n"
1619
+ ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
1620
+ ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
1621
+ "ldr q27, [x25, #0x20]\n"
1622
+ ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
1623
+ ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
1624
+ "sshl v15.16b, v29.16b, v1.16b\n"
1625
+ "sshl v1.16b, v3.16b, v1.16b\n"
1626
+ "and v29.16b, v29.16b, v0.16b\n"
1627
+ "and v3.16b, v3.16b, v0.16b\n"
1628
+ "ldr q0, [x25, #0x30]\n"
1629
+ "fcvtl v20.4s, v20.4h\n"
1630
+ ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
1631
+ "fcvtl v9.4s, v9.4h\n"
1632
+ ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
1633
+ "ldr q27, [x25, #0x40]\n"
1634
+ ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
1635
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
1636
+ "ldr q0, [x25, #0x50]\n"
1637
+ ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
1638
+ ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
1639
+ "ldr q27, [x25, #0x60]\n"
1640
+ ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
1641
+ ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
1642
+ "ldr q0, [x25, #0x70]\n"
1643
+ "add x25, x25, #0x88\n"
1644
+ ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
1645
+ ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
1646
+ "ldr d27, [x20, #0x0]\n"
1647
+ ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
1648
+ ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
1649
+ "fcvtl v27.4s, v27.4h\n"
1650
+ "uzp1 v0.2d, v19.2d, v26.2d\n"
1651
+ "uzp2 v26.2d, v19.2d, v26.2d\n"
1652
+ "fmul v19.4s, v27.4s, v20.s[0]\n"
1653
+ "scvtf v0.4s, v0.4s, #0x4\n"
1654
+ "scvtf v26.4s, v26.4s, #0x4\n"
1655
+ "fmla v2.4s, v0.4s, v19.4s\n"
1656
+ "ldr q19, [x23, #0x0]\n"
1657
+ "uzp1 v0.2d, v18.2d, v17.2d\n"
1658
+ "uzp2 v18.2d, v18.2d, v17.2d\n"
1659
+ "fmul v17.4s, v27.4s, v20.s[1]\n"
1660
+ "scvtf v0.4s, v0.4s, #0x4\n"
1661
+ "scvtf v18.4s, v18.4s, #0x4\n"
1662
+ "fmla v10.4s, v26.4s, v17.4s\n"
1663
+ "ldr q17, [x23, #0x10]\n"
1664
+ "fmul v26.4s, v27.4s, v20.s[2]\n"
1665
+ "fmul v20.4s, v27.4s, v20.s[3]\n"
1666
+ "fmla v12.4s, v0.4s, v26.4s\n"
1667
+ "ldr d0, [x22, #-0x8]\n"
1668
+ "ldr d26, [x21, #-0x8]\n"
1669
+ "fcvtl v0.4s, v0.4h\n"
1670
+ "fmla v28.4s, v18.4s, v20.4s\n"
1671
+ "movi v20.4s, #0x0\n"
1672
+ "movi v18.4s, #0x0\n"
1673
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1674
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1675
+ "ldr q19, [x23, #0x20]\n"
1676
+ "fcvtl v26.4s, v26.4h\n"
1677
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1678
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1679
+ "ldr q19, [x23, #0x40]\n"
1680
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1681
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1682
+ "ldr q19, [x23, #0x60]\n"
1683
+ ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
1684
+ ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
1685
+ "uzp1 v19.2d, v20.2d, v18.2d\n"
1686
+ "scvtf v19.4s, v19.4s, #0x4\n"
1687
+ "uzp2 v20.2d, v20.2d, v18.2d\n"
1688
+ "fmul v18.4s, v27.4s, v9.s[0]\n"
1689
+ "scvtf v20.4s, v20.4s, #0x4\n"
1690
+ "fmla v11.4s, v19.4s, v18.4s\n"
1691
+ "ldr q18, [x22, #0x0]\n"
1692
+ "fmul v19.4s, v27.4s, v9.s[1]\n"
1693
+ "fmla v13.4s, v20.4s, v19.4s\n"
1694
+ "movi v19.4s, #0x0\n"
1695
+ "movi v20.4s, #0x0\n"
1696
+ ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
1697
+ ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
1698
+ "ldr q17, [x23, #0x30]\n"
1699
+ ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
1700
+ ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
1701
+ "ldr q17, [x23, #0x50]\n"
1702
+ ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
1703
+ ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
1704
+ "ldr q17, [x23, #0x70]\n"
1705
+ "add x23, x23, #0x88\n"
1706
+ ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
1707
+ ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
1708
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
1709
+ "scvtf v17.4s, v17.4s, #0x4\n"
1710
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
1711
+ "fmul v19.4s, v27.4s, v9.s[2]\n"
1712
+ "fmul v9.4s, v27.4s, v9.s[3]\n"
1713
+ "scvtf v20.4s, v20.4s, #0x4\n"
1714
+ "fmla v22.4s, v17.4s, v19.4s\n"
1715
+ "ldr q17, [x22, #0x10]\n"
1716
+ "movi v19.4s, #0x0\n"
1717
+ ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
1718
+ "fmla v23.4s, v20.4s, v9.4s\n"
1719
+ "movi v20.4s, #0x0\n"
1720
+ "movi v9.4s, #0x0\n"
1721
+ ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
1722
+ "ldr q18, [x22, #0x20]\n"
1723
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1724
+ ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
1725
+ ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
1726
+ "ldr q18, [x22, #0x40]\n"
1727
+ ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
1728
+ ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
1729
+ "ldr q18, [x22, #0x60]\n"
1730
+ ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
1731
+ ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
1732
+ "movi v18.4s, #0x0\n"
1733
+ ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
1734
+ "ldr q17, [x22, #0x30]\n"
1735
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1736
+ ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
1737
+ "ldr q17, [x22, #0x50]\n"
1738
+ ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
1739
+ ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
1740
+ "ldr q17, [x22, #0x70]\n"
1741
+ "add x22, x22, #0x88\n"
1742
+ ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
1743
+ ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
1744
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
1745
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
1746
+ "fmul v19.4s, v27.4s, v0.s[0]\n"
1747
+ "scvtf v17.4s, v17.4s, #0x4\n"
1748
+ "scvtf v20.4s, v20.4s, #0x4\n"
1749
+ "fmla v25.4s, v17.4s, v19.4s\n"
1750
+ "ldr q19, [x21, #0x0]\n"
1751
+ "fmul v17.4s, v27.4s, v0.s[1]\n"
1752
+ "fmla v5.4s, v20.4s, v17.4s\n"
1753
+ "ldr q17, [x21, #0x10]\n"
1754
+ "uzp1 v20.2d, v9.2d, v18.2d\n"
1755
+ "uzp2 v9.2d, v9.2d, v18.2d\n"
1756
+ "fmul v18.4s, v27.4s, v0.s[2]\n"
1757
+ "fmul v0.4s, v27.4s, v0.s[3]\n"
1758
+ "scvtf v20.4s, v20.4s, #0x4\n"
1759
+ "scvtf v9.4s, v9.4s, #0x4\n"
1760
+ "fmla v7.4s, v20.4s, v18.4s\n"
1761
+ "movi v20.4s, #0x0\n"
1762
+ "movi v18.4s, #0x0\n"
1763
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1764
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1765
+ "ldr q19, [x21, #0x20]\n"
1766
+ "fmla v4.4s, v9.4s, v0.4s\n"
1767
+ "movi v9.4s, #0x0\n"
1768
+ "movi v0.4s, #0x0\n"
1769
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1770
+ "fmul v8.4s, v27.4s, v26.s[0]\n"
1771
+ ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
1772
+ "ldr q17, [x21, #0x30]\n"
1773
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1774
+ "fmul v31.4s, v27.4s, v26.s[1]\n"
1775
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1776
+ "ldr q19, [x21, #0x40]\n"
1777
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1778
+ "fmul v15.4s, v27.4s, v26.s[2]\n"
1779
+ "fmul v27.4s, v27.4s, v26.s[3]\n"
1780
+ ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
1781
+ "ldr q1, [x21, #0x50]\n"
1782
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1783
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1784
+ "ldr q26, [x21, #0x60]\n"
1785
+ ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
1786
+ ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
1787
+ "ldr q21, [x21, #0x70]\n"
1788
+ "add x21, x21, #0x88\n"
1789
+ ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
1790
+ ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
1791
+ ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
1792
+ ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
1793
+ "uzp1 v29.2d, v20.2d, v18.2d\n"
1794
+ "uzp2 v21.2d, v20.2d, v18.2d\n"
1795
+ "scvtf v29.4s, v29.4s, #0x4\n"
1796
+ "uzp1 v18.2d, v9.2d, v0.2d\n"
1797
+ "uzp2 v16.2d, v9.2d, v0.2d\n"
1798
+ "scvtf v21.4s, v21.4s, #0x4\n"
1799
+ "fmla v6.4s, v29.4s, v8.4s\n"
1800
+ "scvtf v18.4s, v18.4s, #0x4\n"
1801
+ "scvtf v16.4s, v16.4s, #0x4\n"
1802
+ "fmla v30.4s, v21.4s, v31.4s\n"
1803
+ "fmla v24.4s, v18.4s, v15.4s\n"
1804
+ "fmla v14.4s, v16.4s, v27.4s\n"
1805
+ "bgt 3b\n"
1806
+ "mov x20, %x[res_ptr]\n"
1807
+ "subs x27, x27, #0x4\n"
1808
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1809
+ "str q2, [x20, #0x0]\n"
1810
+ "add x20, x20, %x[res_stride]\n"
1811
+ "str q10, [x20, #0x0]\n"
1812
+ "add x20, x20, %x[res_stride]\n"
1813
+ "str q12, [x20, #0x0]\n"
1814
+ "add x20, x20, %x[res_stride]\n"
1815
+ "str q28, [x20, #0x0]\n"
1816
+ "add x20, x20, %x[res_stride]\n"
1817
+ "str q11, [x20, #0x0]\n"
1818
+ "add x20, x20, %x[res_stride]\n"
1819
+ "str q13, [x20, #0x0]\n"
1820
+ "add x20, x20, %x[res_stride]\n"
1821
+ "str q22, [x20, #0x0]\n"
1822
+ "add x20, x20, %x[res_stride]\n"
1823
+ "str q23, [x20, #0x0]\n"
1824
+ "add x20, x20, %x[res_stride]\n"
1825
+ "str q25, [x20, #0x0]\n"
1826
+ "add x20, x20, %x[res_stride]\n"
1827
+ "str q5, [x20, #0x0]\n"
1828
+ "add x20, x20, %x[res_stride]\n"
1829
+ "str q7, [x20, #0x0]\n"
1830
+ "add x20, x20, %x[res_stride]\n"
1831
+ "str q4, [x20, #0x0]\n"
1832
+ "add x20, x20, %x[res_stride]\n"
1833
+ "str q6, [x20, #0x0]\n"
1834
+ "add x20, x20, %x[res_stride]\n"
1835
+ "str q30, [x20, #0x0]\n"
1836
+ "add x20, x20, %x[res_stride]\n"
1837
+ "str q24, [x20, #0x0]\n"
1838
+ "add x20, x20, %x[res_stride]\n"
1839
+ "str q14, [x20, #0x0]\n"
1840
+ "bne 2b\n"
1841
+ "mov x20, #0x4\n"
1842
+ "sub x10, x10, #0x10\n"
1843
+ "cmp x10, #0x10\n"
1844
+ "mov %x[res_ptr], x26\n"
1845
+ "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
1846
+ "bge 1b\n"
1847
+ "4:" // Row loop skip
1848
+ "cbz x10, 9f\n"
1849
+ "5:" // Row tail: Row loop
1850
+ "add x24, %x[b_ptr], #0x8\n"
1851
+ "mov x23, %x[nc]\n"
1852
+ "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
1853
+ "6:" // Row tail: Column loop
1854
+ "movi v2.16b, #0x0\n"
1855
+ "movi v10.16b, #0x0\n"
1856
+ "add x25, %x[a_ptr], #0x8\n"
1857
+ "mov x21, %x[nb]\n"
1858
+ "movi v12.16b, #0x0\n"
1859
+ "movi v28.16b, #0x0\n"
1860
+ "7:" // Row tail: Block loop
1861
+ "ldr q6, [x24, #0x0]\n"
1862
+ "ldr q5, [x24, #0x10]\n"
1863
+ "movi v17.16b, #0x4\n"
1864
+ "movi v8.4s, #0x0\n"
1865
+ "ldr q4, [x25, #0x0]\n"
1866
+ "ldr q13, [x25, #0x10]\n"
1867
+ "movi v27.4s, #0x0\n"
1868
+ "movi v0.4s, #0x0\n"
1869
+ "ldr q31, [x24, #0x20]\n"
1870
+ "ldr q14, [x24, #0x30]\n"
1871
+ "movi v29.4s, #0x0\n"
1872
+ "movi v22.16b, #0xf0\n"
1873
+ "ldr q11, [x25, #0x20]\n"
1874
+ "ldr q23, [x25, #0x30]\n"
1875
+ "sshl v21.16b, v6.16b, v17.16b\n"
1876
+ "sshl v16.16b, v5.16b, v17.16b\n"
1877
+ "ldr q20, [x25, #0x40]\n"
1878
+ "ldr q26, [x25, #0x50]\n"
1879
+ "and v6.16b, v6.16b, v22.16b\n"
1880
+ "and v5.16b, v5.16b, v22.16b\n"
1881
+ "ldr q25, [x25, #0x60]\n"
1882
+ "ldr q3, [x25, #0x70]\n"
1883
+ "sshl v19.16b, v31.16b, v17.16b\n"
1884
+ "sshl v18.16b, v14.16b, v17.16b\n"
1885
+ "ldr d17, [x25, #-0x8]\n"
1886
+ ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
1887
+ ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
1888
+ "and v31.16b, v31.16b, v22.16b\n"
1889
+ ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
1890
+ ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
1891
+ "and v14.16b, v14.16b, v22.16b\n"
1892
+ "sub x20, x24, #0x8\n"
1893
+ "ldr d16, [x20, #0x0]\n"
1894
+ "subs x21, x21, #0x1\n"
1895
+ "add x25, x25, #0x88\n"
1896
+ "fcvtl v17.4s, v17.4h\n"
1897
+ "add x24, x24, #0x48\n"
1898
+ ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
1899
+ ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
1900
+ ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
1901
+ ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
1902
+ "fcvtl v16.4s, v16.4h\n"
1903
+ ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
1904
+ ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
1905
+ "fmul v23.4s, v16.4s, v17.s[0]\n"
1906
+ "fmul v21.4s, v16.4s, v17.s[1]\n"
1907
+ "fmul v1.4s, v16.4s, v17.s[2]\n"
1908
+ "fmul v20.4s, v16.4s, v17.s[3]\n"
1909
+ ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
1910
+ ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
1911
+ ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
1912
+ ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
1913
+ ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
1914
+ ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
1915
+ "uzp1 v19.2d, v8.2d, v27.2d\n"
1916
+ "uzp2 v18.2d, v8.2d, v27.2d\n"
1917
+ "scvtf v19.4s, v19.4s, #0x4\n"
1918
+ "uzp1 v17.2d, v0.2d, v29.2d\n"
1919
+ "uzp2 v16.2d, v0.2d, v29.2d\n"
1920
+ "scvtf v18.4s, v18.4s, #0x4\n"
1921
+ "fmla v2.4s, v19.4s, v23.4s\n"
1922
+ "scvtf v17.4s, v17.4s, #0x4\n"
1923
+ "scvtf v16.4s, v16.4s, #0x4\n"
1924
+ "fmla v10.4s, v18.4s, v21.4s\n"
1925
+ "fmla v12.4s, v17.4s, v1.4s\n"
1926
+ "fmla v28.4s, v16.4s, v20.4s\n"
1927
+ "bgt 7b\n"
1928
+ "mov x20, %x[res_ptr]\n"
1929
+ "cmp x10, #0x1\n"
1930
+ "str q2, [x20, #0x0]\n"
1931
+ "add x20, x20, %x[res_stride]\n"
1932
+ "ble 8f\n"
1933
+ "cmp x10, #0x2\n"
1934
+ "str q10, [x20, #0x0]\n"
1935
+ "add x20, x20, %x[res_stride]\n"
1936
+ "ble 8f\n"
1937
+ "cmp x10, #0x3\n"
1938
+ "str q12, [x20, #0x0]\n"
1939
+ "add x20, x20, %x[res_stride]\n"
1940
+ "ble 8f\n"
1941
+ "str q28, [x20, #0x0]\n"
1942
+ "8:" // Row tail: Accumulator store skip
1943
+ "subs x23, x23, #0x4\n"
1944
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1945
+ "bne 6b\n"
1946
+ "subs x10, x10, #0x4\n"
1947
+ "add %x[a_ptr], %x[a_ptr], x9\n"
1948
+ "mov %x[res_ptr], x22\n"
1949
+ "bgt 5b\n"
1950
+ "9:" // Row tail: Row loop skip
1951
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1952
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1953
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1954
+ );
1955
+ return;
1272
1956
  }
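As with the 4x4 kernel earlier in this diff, the change here is structural as much as textual: 0.3.0 selected between the `smmla` assembly and the reference loop purely at compile time with `#if`/`#else` (plus the now-removed SVE assert), while 0.3.2 compiles the reference loop unconditionally and enters the assembly only after runtime checks (`ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()`), returning early. A minimal sketch of that shape, with hypothetical kernel names standing in for the inline-asm and scalar bodies (not ggml's API):

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for illustration only. */
    static bool has_i8mm(void)            { return false; /* imagine a real runtime probe */ }
    static void gemm_asm(float *s, int n) { for (int i = 0; i < n; i++) s[i] = 1.0f; }
    static void gemm_ref(float *s, int n) { for (int i = 0; i < n; i++) s[i] = 2.0f; }

    static void gemm(float *s, int n) {
    #if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
        if (has_i8mm()) {  /* compiled in when the target allows it, taken when the CPU confirms it */
            gemm_asm(s, n);
            return;        /* early return replaces the old #if/#else split */
        }
    #endif
        gemm_ref(s, n);    /* reference path is now always available */
    }

    int main(void) {
        float out[4];
        gemm(out, 4);
        printf("%.1f\n", out[0]);
        return 0;
    }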
1273
- #endif
1274
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1275
- const void * b_ptr = vx;
1276
- const void * a_ptr = vy;
1277
- float * res_ptr = s;
1278
- size_t res_stride = bs * sizeof(float);
1279
-
1280
- __asm__ __volatile__(
1281
- "mov x10, %x[nr]\n"
1282
- "mov x9, #0x88\n"
1283
- "cmp x10, #0x10\n"
1284
- "mul x9, %x[nb], x9\n"
1285
- "blt 4f\n"
1286
- "1:" // Row loop
1287
- "add x28, %x[b_ptr], #0x8\n"
1288
- "mov x27, %x[nc]\n"
1289
- "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
1290
- "2:" // Column loop
1291
- "add x25, %x[a_ptr], #0x8\n"
1292
- "movi v2.16b, #0x0\n"
1293
- "movi v10.16b, #0x0\n"
1294
- "mov x24, %x[nb]\n"
1295
- "add x23, x25, x9\n"
1296
- "movi v12.16b, #0x0\n"
1297
- "movi v28.16b, #0x0\n"
1298
- "add x22, x23, x9\n"
1299
- "movi v11.16b, #0x0\n"
1300
- "movi v13.16b, #0x0\n"
1301
- "add x21, x22, x9\n"
1302
- "movi v22.16b, #0x0\n"
1303
- "movi v23.16b, #0x0\n"
1304
- "movi v25.16b, #0x0\n"
1305
- "movi v5.16b, #0x0\n"
1306
- "movi v7.16b, #0x0\n"
1307
- "movi v4.16b, #0x0\n"
1308
- "movi v6.16b, #0x0\n"
1309
- "movi v30.16b, #0x0\n"
1310
- "movi v24.16b, #0x0\n"
1311
- "movi v14.16b, #0x0\n"
1312
- "3:" // Block loop
1313
- "ldr q21, [x28, #0x0]\n"
1314
- "ldr q16, [x28, #0x10]\n"
1315
- "movi v1.16b, #0x4\n"
1316
- "movi v19.4s, #0x0\n"
1317
- "ldr q27, [x25, #0x0]\n"
1318
- "ldr q15, [x25, #0x10]\n"
- "movi v26.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "ldr q29, [x28, #0x20]\n"
- "ldr q3, [x28, #0x30]\n"
- "movi v17.4s, #0x0\n"
- "movi v0.16b, #0xf0\n"
- "ldr d20, [x25, #-0x8]\n"
- "ldr d9, [x23, #-0x8]\n"
- "sshl v8.16b, v21.16b, v1.16b\n"
- "sshl v31.16b, v16.16b, v1.16b\n"
- "and v21.16b, v21.16b, v0.16b\n"
- "and v16.16b, v16.16b, v0.16b\n"
- "sub x20, x28, #0x8\n"
- "subs x24, x24, #0x1\n"
- "add x28, x28, #0x48\n"
- ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
- ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
- "ldr q27, [x25, #0x20]\n"
- ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
- ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
- "sshl v15.16b, v29.16b, v1.16b\n"
- "sshl v1.16b, v3.16b, v1.16b\n"
- "and v29.16b, v29.16b, v0.16b\n"
- "and v3.16b, v3.16b, v0.16b\n"
- "ldr q0, [x25, #0x30]\n"
- "fcvtl v20.4s, v20.4h\n"
- ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
- "fcvtl v9.4s, v9.4h\n"
- ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
- "ldr q27, [x25, #0x40]\n"
- ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
- ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
- "ldr q0, [x25, #0x50]\n"
- ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
- ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
- "ldr q27, [x25, #0x60]\n"
- ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
- ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
- "ldr q0, [x25, #0x70]\n"
- "add x25, x25, #0x88\n"
- ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
- ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
- "ldr d27, [x20, #0x0]\n"
- ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
- ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
- "fcvtl v27.4s, v27.4h\n"
- "uzp1 v0.2d, v19.2d, v26.2d\n"
- "uzp2 v26.2d, v19.2d, v26.2d\n"
- "fmul v19.4s, v27.4s, v20.s[0]\n"
- "scvtf v0.4s, v0.4s, #0x4\n"
- "scvtf v26.4s, v26.4s, #0x4\n"
- "fmla v2.4s, v0.4s, v19.4s\n"
- "ldr q19, [x23, #0x0]\n"
- "uzp1 v0.2d, v18.2d, v17.2d\n"
- "uzp2 v18.2d, v18.2d, v17.2d\n"
- "fmul v17.4s, v27.4s, v20.s[1]\n"
- "scvtf v0.4s, v0.4s, #0x4\n"
- "scvtf v18.4s, v18.4s, #0x4\n"
- "fmla v10.4s, v26.4s, v17.4s\n"
- "ldr q17, [x23, #0x10]\n"
- "fmul v26.4s, v27.4s, v20.s[2]\n"
- "fmul v20.4s, v27.4s, v20.s[3]\n"
- "fmla v12.4s, v0.4s, v26.4s\n"
- "ldr d0, [x22, #-0x8]\n"
- "ldr d26, [x21, #-0x8]\n"
- "fcvtl v0.4s, v0.4h\n"
- "fmla v28.4s, v18.4s, v20.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
- "ldr q19, [x23, #0x20]\n"
- "fcvtl v26.4s, v26.4h\n"
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
- "ldr q19, [x23, #0x40]\n"
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
- "ldr q19, [x23, #0x60]\n"
- ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
- ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
- "uzp1 v19.2d, v20.2d, v18.2d\n"
- "scvtf v19.4s, v19.4s, #0x4\n"
- "uzp2 v20.2d, v20.2d, v18.2d\n"
- "fmul v18.4s, v27.4s, v9.s[0]\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "fmla v11.4s, v19.4s, v18.4s\n"
- "ldr q18, [x22, #0x0]\n"
- "fmul v19.4s, v27.4s, v9.s[1]\n"
- "fmla v13.4s, v20.4s, v19.4s\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
- ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
- "ldr q17, [x23, #0x30]\n"
- ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
- ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
- "ldr q17, [x23, #0x50]\n"
- ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
- ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
- "ldr q17, [x23, #0x70]\n"
- "add x23, x23, #0x88\n"
- ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
- ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
- "uzp1 v17.2d, v19.2d, v20.2d\n"
- "scvtf v17.4s, v17.4s, #0x4\n"
- "uzp2 v20.2d, v19.2d, v20.2d\n"
- "fmul v19.4s, v27.4s, v9.s[2]\n"
- "fmul v9.4s, v27.4s, v9.s[3]\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "fmla v22.4s, v17.4s, v19.4s\n"
- "ldr q17, [x22, #0x10]\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
- "fmla v23.4s, v20.4s, v9.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
- "ldr q18, [x22, #0x20]\n"
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
- ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
- ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
- "ldr q18, [x22, #0x40]\n"
- ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
- ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
- "ldr q18, [x22, #0x60]\n"
- ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
- ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
- "ldr q17, [x22, #0x30]\n"
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
- ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
- "ldr q17, [x22, #0x50]\n"
- ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
- ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
- "ldr q17, [x22, #0x70]\n"
- "add x22, x22, #0x88\n"
- ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
- ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
- "uzp1 v17.2d, v19.2d, v20.2d\n"
- "uzp2 v20.2d, v19.2d, v20.2d\n"
- "fmul v19.4s, v27.4s, v0.s[0]\n"
- "scvtf v17.4s, v17.4s, #0x4\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "fmla v25.4s, v17.4s, v19.4s\n"
- "ldr q19, [x21, #0x0]\n"
- "fmul v17.4s, v27.4s, v0.s[1]\n"
- "fmla v5.4s, v20.4s, v17.4s\n"
- "ldr q17, [x21, #0x10]\n"
- "uzp1 v20.2d, v9.2d, v18.2d\n"
- "uzp2 v9.2d, v9.2d, v18.2d\n"
- "fmul v18.4s, v27.4s, v0.s[2]\n"
- "fmul v0.4s, v27.4s, v0.s[3]\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "scvtf v9.4s, v9.4s, #0x4\n"
- "fmla v7.4s, v20.4s, v18.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
- "ldr q19, [x21, #0x20]\n"
- "fmla v4.4s, v9.4s, v0.4s\n"
- "movi v9.4s, #0x0\n"
- "movi v0.4s, #0x0\n"
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
- "fmul v8.4s, v27.4s, v26.s[0]\n"
- ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
- "ldr q17, [x21, #0x30]\n"
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
- "fmul v31.4s, v27.4s, v26.s[1]\n"
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
- "ldr q19, [x21, #0x40]\n"
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
- "fmul v15.4s, v27.4s, v26.s[2]\n"
- "fmul v27.4s, v27.4s, v26.s[3]\n"
- ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
- "ldr q1, [x21, #0x50]\n"
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
- "ldr q26, [x21, #0x60]\n"
- ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
- ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
- "ldr q21, [x21, #0x70]\n"
- "add x21, x21, #0x88\n"
- ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
- ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
- ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
- ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
- "uzp1 v29.2d, v20.2d, v18.2d\n"
- "uzp2 v21.2d, v20.2d, v18.2d\n"
- "scvtf v29.4s, v29.4s, #0x4\n"
- "uzp1 v18.2d, v9.2d, v0.2d\n"
- "uzp2 v16.2d, v9.2d, v0.2d\n"
- "scvtf v21.4s, v21.4s, #0x4\n"
- "fmla v6.4s, v29.4s, v8.4s\n"
- "scvtf v18.4s, v18.4s, #0x4\n"
- "scvtf v16.4s, v16.4s, #0x4\n"
- "fmla v30.4s, v21.4s, v31.4s\n"
- "fmla v24.4s, v18.4s, v15.4s\n"
- "fmla v14.4s, v16.4s, v27.4s\n"
- "bgt 3b\n"
- "mov x20, %x[res_ptr]\n"
- "subs x27, x27, #0x4\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "str q2, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q10, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q12, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q28, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q11, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q13, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q22, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q23, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q25, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q5, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q7, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q4, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q6, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q30, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q24, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q14, [x20, #0x0]\n"
- "bne 2b\n"
- "mov x20, #0x4\n"
- "sub x10, x10, #0x10\n"
- "cmp x10, #0x10\n"
- "mov %x[res_ptr], x26\n"
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
- "bge 1b\n"
- "4:" // Row loop skip
- "cbz x10, 9f\n"
- "5:" // Row tail: Row loop
- "add x24, %x[b_ptr], #0x8\n"
- "mov x23, %x[nc]\n"
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
- "6:" // Row tail: Column loop
- "movi v2.16b, #0x0\n"
- "movi v10.16b, #0x0\n"
- "add x25, %x[a_ptr], #0x8\n"
- "mov x21, %x[nb]\n"
- "movi v12.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "7:" // Row tail: Block loop
- "ldr q6, [x24, #0x0]\n"
- "ldr q5, [x24, #0x10]\n"
- "movi v17.16b, #0x4\n"
- "movi v8.4s, #0x0\n"
- "ldr q4, [x25, #0x0]\n"
- "ldr q13, [x25, #0x10]\n"
- "movi v27.4s, #0x0\n"
- "movi v0.4s, #0x0\n"
- "ldr q31, [x24, #0x20]\n"
- "ldr q14, [x24, #0x30]\n"
- "movi v29.4s, #0x0\n"
- "movi v22.16b, #0xf0\n"
- "ldr q11, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "sshl v21.16b, v6.16b, v17.16b\n"
- "sshl v16.16b, v5.16b, v17.16b\n"
- "ldr q20, [x25, #0x40]\n"
- "ldr q26, [x25, #0x50]\n"
- "and v6.16b, v6.16b, v22.16b\n"
- "and v5.16b, v5.16b, v22.16b\n"
- "ldr q25, [x25, #0x60]\n"
- "ldr q3, [x25, #0x70]\n"
- "sshl v19.16b, v31.16b, v17.16b\n"
- "sshl v18.16b, v14.16b, v17.16b\n"
- "ldr d17, [x25, #-0x8]\n"
- ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
- ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
- "and v31.16b, v31.16b, v22.16b\n"
- ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
- ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
- "and v14.16b, v14.16b, v22.16b\n"
- "sub x20, x24, #0x8\n"
- "ldr d16, [x20, #0x0]\n"
- "subs x21, x21, #0x1\n"
- "add x25, x25, #0x88\n"
- "fcvtl v17.4s, v17.4h\n"
- "add x24, x24, #0x48\n"
- ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
- ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
- ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
- ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
- "fcvtl v16.4s, v16.4h\n"
- ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
- ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
- "fmul v23.4s, v16.4s, v17.s[0]\n"
- "fmul v21.4s, v16.4s, v17.s[1]\n"
- "fmul v1.4s, v16.4s, v17.s[2]\n"
- "fmul v20.4s, v16.4s, v17.s[3]\n"
- ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
- ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
- ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
- ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
- ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
- ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
- "uzp1 v19.2d, v8.2d, v27.2d\n"
- "uzp2 v18.2d, v8.2d, v27.2d\n"
- "scvtf v19.4s, v19.4s, #0x4\n"
- "uzp1 v17.2d, v0.2d, v29.2d\n"
- "uzp2 v16.2d, v0.2d, v29.2d\n"
- "scvtf v18.4s, v18.4s, #0x4\n"
- "fmla v2.4s, v19.4s, v23.4s\n"
- "scvtf v17.4s, v17.4s, #0x4\n"
- "scvtf v16.4s, v16.4s, #0x4\n"
- "fmla v10.4s, v18.4s, v21.4s\n"
- "fmla v12.4s, v17.4s, v1.4s\n"
- "fmla v28.4s, v16.4s, v20.4s\n"
- "bgt 7b\n"
- "mov x20, %x[res_ptr]\n"
- "cmp x10, #0x1\n"
- "str q2, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "cmp x10, #0x2\n"
- "str q10, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "cmp x10, #0x3\n"
- "str q12, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "str q28, [x20, #0x0]\n"
- "8:" // Row tail: Accumulator store skip
- "subs x23, x23, #0x4\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "bne 6b\n"
- "subs x10, x10, #0x4\n"
- "add %x[a_ptr], %x[a_ptr], x9\n"
- "mov %x[res_ptr], x22\n"
- "bgt 5b\n"
- "9:" // Row tail: Row loop skip
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
- #elif defined(__ARM_NEON) && defined(__aarch64__)
- GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
- "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
- "performance");
- #else
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
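Note on the guard closed above: the reworked file folds what used to be an `#if`/`#elif`/`#else` ladder (ending in the runtime `GGML_ASSERT` removed above) into one combined compile-time condition, so unsupported configurations now fall through to the portable scalar loop below instead of aborting. MSVC's own C front end (`_MSC_VER` without `__clang__`) is excluded because it does not accept GCC-style extended inline assembly. A minimal sketch of the resulting nesting, using the macro spelling from the diff (function body elided; this is not the file's literal code):

    /* Fast path compiles only where extended inline asm and i8mm exist. */
    #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && \
        defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
        /* hand-written NEON + i8mm inline-asm kernel (the block removed above) */
    #endif
    /* portable scalar loop (sumf/sumi below) covers every other build */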
  float sumf[4][4];
  int sumi;
 
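For orientation, the inline-asm kernel deleted above computes a 4-row by 4-column tile of the Q4_0 x Q8_0 product with the i8mm `smmla` instruction: each `smmla` accumulates a 2x2 tile of int32 dot products (two 8-byte rows from each operand), `uzp1`/`uzp2` de-interleave those tiles into plain row vectors, and `scvtf ..., #0x4` converts to float while dividing by 16 to undo the x16 scaling that `sshl #4`/`and 0xf0` introduced when splitting the packed nibbles. A scalar sketch of the `smmla` semantics (per the Arm ISA; the wrapper name is illustrative):

    #include <stdint.h>

    /* acc += A * B^T for 2x8 int8 operands, which is what one SMMLA does
     * on a 128-bit vector viewed as 2x8 bytes / 2x2 int32 accumulators. */
    static void smmla_ref(int32_t acc[2][2],
                          const int8_t a[2][8], const int8_t b[2][8]) {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                int32_t sum = 0;
                for (int k = 0; k < 8; k++) {
                    sum += (int32_t) a[i][k] * (int32_t) b[j][k];
                }
                acc[i][j] += sum;
            }
        }
    }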
@@ -1691,7 +1974,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
  const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  }
  sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
  }
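The hunk above is a whitespace-only change, but the surrounding scalar loop is worth a note now that it is the universal fallback: `v0` extracts the low nibble by shifting it into the sign position of a byte (value x 16) and `v1` keeps the high nibble in place (also value x 16), so the combined sum carries one common factor of 16 that a single `>> 4` removes. A self-contained check of that identity (hypothetical values; relies on arithmetic right shift of negative ints, which ggml's targets provide):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        uint8_t q  = 0xA7;               /* high nibble 0xA (-6), low nibble 0x7 (+7) */
        int     v0 = (int8_t)(q << 4);   /*  7 * 16 = 112 */
        int     v1 = (int8_t)(q & 0xF0); /* -6 * 16 = -96 */
        int     a0 = 3, a1 = 5;          /* two int8 activations */
        /* v0*a0 + v1*a1 == 16 * (7*a0 + (-6)*a1), so the shift is exact */
        assert(((v0 * a0 + v1 * a1) >> 4) == 7 * a0 + (-6) * a1);
        return 0;
    }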
@@ -1704,7 +1987,6 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
  }
  }
  }
- #endif
  }

  void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -1727,8 +2009,9 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
  UNUSED(ncols_interleaved);
  UNUSED(blocklen);

- #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (svcntw() == 8) {
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+ if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
  const void * b_ptr = vx;
  const void * a_ptr = vy;
  float * res_ptr = s;
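The replaced condition above is the substantive part of this hunk: the old code chose the SVE kernel purely at compile time and asserted at runtime (the `GGML_ASSERT`s removed further down) when the host could not satisfy it, while the new code keeps the compile-time feature test but adds a runtime gate. `ggml_cpu_get_sve_cnt() == QK8_0` (QK8_0 is 32) appears to test for a 256-bit SVE vector length in bytes, matching the old `svcntw() == 8` check of 8 32-bit words. A sketch of the dispatch shape this enables (wrapper and kernel names are illustrative, not the file's code):

    /* Runtime dispatch: prefer the 256-bit SVE + i8mm kernel when the
     * host CPU supports it, otherwise fall through to a portable path. */
    static void gemm_q4_0_8x8_q8_0_dispatch(void) {
    #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
        if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() &&
            ggml_cpu_get_sve_cnt() == QK8_0) {
            kernel_sve_256();   /* fast path, returns early */
            return;
        }
    #endif
        kernel_fallback();      /* NEON or scalar path later in the file */
    }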
@@ -2138,25 +2421,759 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
  );
  return;
  }
- else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
- GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
- "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
- "performance");
- }
- else if (ggml_cpu_has_neon()) {
- GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
- "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
- "quantization format for optimal performance");
+ #endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+ #elif defined(__AVX2__) || defined(__AVX512F__)
+ {
+ const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
+ const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
+ int64_t b_nb = n / QK4_0;
+ int64_t y = 0;
+ // Mask to mask out nibbles from packed bytes
+ const __m256i m4b = _mm256_set1_epi8(0x0F);
+ const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
+ // Lookup table to convert signed nibbles to signed bytes
+ __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
+ signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
+ // Permute mask used for easier vector processing at later stages
+ __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
+ int64_t xstart = 0;
+ int anr = nr - nr%16; // Used to align nr with boundary of 16
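`signextendlut` deserves a note: it maps each 4-bit value 0..15 to its two's-complement signed reading (0..7 stay put, 8..15 become -8..-1), so one `_mm256_shuffle_epi8` sign-extends 32 nibbles at once. The 16-entry table is duplicated into both 128-bit halves because `shuffle_epi8` only indexes within its own lane. A scalar sketch of the same mapping:

    #include <stdint.h>

    /* Per-byte equivalent of the signextendlut shuffle: sign-extend a
     * 4-bit two's-complement value to 8 bits via a 16-entry table. */
    static int8_t sign_extend_nibble(uint8_t n) {
        static const int8_t lut[16] = { 0,  1,  2,  3,  4,  5,  6,  7,
                                       -8, -7, -6, -5, -4, -3, -2, -1 };
        return lut[n & 0x0F];
    }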
+ #ifdef __AVX512F__
+ int anc = nc - nc%16; // Used to align nc with boundary of 16
+ // Mask to mask out nibbles from packed bytes expanded to 512 bit length
+ const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
+ // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
+ __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
+
+ // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
+ for (; y < anr / 4; y += 4) {
+
+ const block_q8_0x4 * a_ptrs[4];
+
+ a_ptrs[0] = a_ptr_start + (y * nb);
+ for (int i = 0; i < 3; ++i) {
+ a_ptrs[i + 1] = a_ptrs[i] + nb;
+ }
+
+ // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
+ for (int64_t x = 0; x < anc / 8; x += 2) {
+
+ const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
+ const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
+
+ // Master FP accumulators
+ __m512 acc_rows[16];
+ for (int i = 0; i < 16; i++) {
+ acc_rows[i] = _mm512_setzero_ps();
+ }
+
+ for (int64_t b = 0; b < nb; b++) {
+ // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
+ const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
+ const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
+ const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
+ const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+
+ const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
+ const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
+ const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
+ const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+
+ // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
+ const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+ const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+ const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+ const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+ const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
+ const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
+ const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
+ const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+
+ const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
+ const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
+ const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
+ const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+
+ // 4-bit -> 8-bit - Sign is maintained
+ const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
+ const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+
+ const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
+ const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+
+ const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
+ const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+
+ const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
+ const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+
+ // Shuffle pattern one - right side input
+ const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
+ const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+
+ const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
+ const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+
+ const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
+ const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+
+ const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
+ const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+
+ // Shuffle pattern two - right side input
+
+ const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
+ const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+
+ const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
+ const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+
+ const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
+ const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+
+ const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
+ const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+
+ // Scale values - Load the weight scale values of two block_q4_0x8
+ const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
+
+ // Process LHS in pairs of rows
+ for (int rp = 0; rp < 4; rp++) {
+
+ // Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+ // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
+ __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+ __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
+ __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
+ __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+ __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
+ __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
+ __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+ __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
+ __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
+ __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+ __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
+ __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
+
+ __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
+ __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
+ __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
+ __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
+ __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
+ __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
+ __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
+ __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
+
+ // Shuffle pattern one - left side input
+
+ const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+ const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+ const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+ const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+ const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+ const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+ const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+ const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+ // Shuffle pattern two - left side input
+
+ const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+ const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+ const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+ const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+ const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+ const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+ const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+ const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+ // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane, i.e. corresponding bytes are multiplied and added into 32 bit integers within the 32 bit lane
+ // Resembles MMLAs into 2x2 matrices in ARM Version
+ __m512i iacc_mat_00_sp1 =
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
+ __m512i iacc_mat_01_sp1 =
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
+ __m512i iacc_mat_10_sp1 =
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
+ __m512i iacc_mat_11_sp1 =
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
+ __m512i iacc_mat_00_sp2 =
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
+ __m512i iacc_mat_01_sp2 =
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
+ __m512i iacc_mat_10_sp2 =
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
+ __m512i iacc_mat_11_sp2 =
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
+
+ // Outputs of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+ __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+ __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+ __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+ __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+ // Straighten out to make 4 row vectors
+ __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
+ __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
+ __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
+ __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
+
+ // Load the scale values for all 4 Q8_0 blocks and repeat them across lanes
+ const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
+ const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
+
+ // Multiply with appropriate scales and accumulate
+ acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
+ acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
+ acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
+ acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
+ }
+ }
+
+ // Store the accumulated values
+ for (int i = 0; i < 16; i++) {
+ _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+ }
+ }
+ }
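What the loop above builds: each `iacc_mat_*` vector holds 2x2 (row by column-pair) tiles of int32 dot products per 128-bit chunk, deliberately mirroring the ARM `smmla` layout, and the `_mm512_mask_blend_epi32(0xCCCC, ...)` plus `_mm512_shuffle_epi32(..., 78)` pair (78 swaps the two 64-bit halves of each 128-bit group) straightens the tiles into four ordinary row vectors. `mul_sum_i8_pairs_int32x16` is defined elsewhere in this file; its assumed per-lane contract is sketched below (illustrative, not the actual implementation):

    #include <stdint.h>

    /* Assumed contract of mul_sum_i8_pairs_int32x16, one 32-bit lane at a
     * time: the dot product of the four int8 pairs sharing that lane. */
    static int32_t mul_sum_i8_pairs_lane(const int8_t x[4], const int8_t y[4]) {
        int32_t sum = 0;
        for (int k = 0; k < 4; k++) {
            sum += (int32_t) x[k] * (int32_t) y[k];
        }
        return sum;
    }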
2646
+ // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
2647
+ for (; y < nr / 4; y ++) {
2648
+
2649
+ const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
2650
+
2651
+ // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
2652
+ for (int64_t x = 0; x < anc / 8; x += 2) {
2653
+
2654
+ const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
2655
+ const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
2656
+
2657
+ // Master FP accumulators
2658
+ __m512 acc_rows[4];
2659
+ for (int i = 0; i < 4; i++) {
2660
+ acc_rows[i] = _mm512_setzero_ps();
2661
+ }
2662
+
2663
+ for (int64_t b = 0; b < nb; b++) {
2664
+ // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
2665
+ const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
2666
+ const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
2667
+ const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
2668
+ const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
2669
+
2670
+ const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
2671
+ const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
2672
+ const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
2673
+ const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
2674
+
2675
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
2676
+ const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
2677
+ const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
2678
+ const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
2679
+ const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
2680
+
2681
+ const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
2682
+ const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
2683
+ const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
2684
+ const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
2685
+
2686
+ const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
2687
+ const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
2688
+ const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
2689
+ const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
2690
+
2691
+ // 4-bit -> 8-bit - Sign is maintained
2692
+ const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
2693
+ const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
2694
+
2695
+ const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
2696
+ const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
2697
+
2698
+ const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
2699
+ const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
2700
+
2701
+ const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
2702
+ const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
2703
+
2704
+ // Shuffle pattern one - right side input
2705
+ const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
2706
+ const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
2707
+
2708
+ const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
2709
+ const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
2710
+
2711
+ const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
2712
+ const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
2713
+
2714
+ const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
2715
+ const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
2716
+
2717
+ // Shuffle pattern two - right side input
2718
+
2719
+ const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
2720
+ const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
2721
+
2722
+ const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
2723
+ const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
2724
+
2725
+ const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
2726
+ const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
2727
+
2728
+ const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
2729
+ const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
2730
+
2731
+
2732
+ // Scale values - Load the weight scale values of two block_q4_0x8
2733
+ const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
2734
+
2735
+ // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
2736
+ // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
2737
+ __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
2738
+ __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
2739
+ __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
2740
+ __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
2741
+ __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
2742
+ __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
2743
+ __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
2744
+ __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
2745
+ __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
2746
+ __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
2747
+ __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
2748
+ __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
2749
+
2750
+ __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
2751
+ __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
2752
+ __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
2753
+ __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
2754
+ __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
2755
+ __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
2756
+ __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
2757
+ __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
2758
+
2759
+ // Shuffle pattern one - left side input
2760
+
2761
+ const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
2762
+ const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
2763
+
2764
+ const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
2765
+ const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
2766
+
2767
+ const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
2768
+ const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
2769
+
2770
+ const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
2771
+ const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
2772
+
2773
+ // Shuffle pattern two - left side input
2774
+
2775
+ const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
2776
+ const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
2777
+
2778
+ const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
2779
+ const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
2780
+
2781
+ const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
2782
+ const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
2783
+
2784
+ const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
2785
+ const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
2786
+
2787
+ // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
2788
+ // Resembles MMLAs into 2x2 matrices in ARM Version
2789
+ __m512i iacc_mat_00_sp1 =
2790
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
2791
+ __m512i iacc_mat_01_sp1 =
2792
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
2793
+ __m512i iacc_mat_10_sp1 =
2794
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
2795
+ __m512i iacc_mat_11_sp1 =
2796
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
2797
+ __m512i iacc_mat_00_sp2 =
2798
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
2799
+ __m512i iacc_mat_01_sp2 =
2800
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
2801
+ __m512i iacc_mat_10_sp2 =
2802
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
2803
+ __m512i iacc_mat_11_sp2 =
2804
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
2805
+
2806
+ // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
2807
+ __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
2808
+ __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
2809
+ __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
2810
+ __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+ // Straighten out to make 4 row vectors
+ __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
+ __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
+ __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
+ __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
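// Editor's note: imm 78 == 0b01001110 == _MM_SHUFFLE(1,0,3,2), which swaps
// the two 64-bit halves of every 128-bit lane, and mask 0xCCCC keeps dwords
// {0,1} of the first blend operand and dwords {2,3} of the second in each
// 4-dword group. A scalar sketch of one such group, assuming this reading
// (blend_0xC_lane is a hypothetical name):

static inline void blend_0xC_lane(const int32_t a[4], const int32_t b[4], int32_t out[4]) {
    out[0] = a[0]; out[1] = a[1]; // mask bits 0,1 clear -> first operand
    out[2] = b[2]; out[3] = b[3]; // mask bits 2,3 set   -> second operand
}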
+
+ // Load the scale values (d) for all the 4 Q8_0 blocks and repeat them across lanes
+ const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
+ const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
+
+ // Multiply with appropriate scales and accumulate
+ acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
+ acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
+ acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
+ acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
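// Editor's note: per output element the update above is the standard Q4_0 x
// Q8_0 dequantized accumulation, acc += (float)idot * (d_col * d_row). A
// scalar sketch of one lane, with 'idot' standing for one lane of iacc_row_*
// (acc_one_lane is a hypothetical name):

static inline float acc_one_lane(float acc, int32_t idot, float d_col, float d_row) {
    return acc + (float) idot * (d_col * d_row); // matches the fmadd per lane
}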
2828
+ }
2829
+
2830
+ // Store the accumulated values
2831
+ for (int i = 0; i < 4; i++) {
2832
+ _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
2833
+ }
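// Editor's note: a sketch of the destination addressing used by the store
// above; bs is the row stride of s, per the function signature in the hunk
// header further below (out_ptr is a hypothetical name):

static inline float * out_ptr(float * s, size_t bs, int64_t y, int i, int64_t x) {
    return s + ((y * 4 + i) * bs + x * 8); // row y*4+i, starting at column x*8
}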
2834
+ }
2835
+ }
2836
+ if (anc != nc) {
2837
+ xstart = anc/8;
2838
+ y = 0;
2839
+ }
2840
+ #endif // __AVX512F__
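// Editor's note: a sketch of the remainder hand-off, assuming (as the code
// above suggests; the setup of anr/anc happens earlier in the function and
// is not shown in this hunk) that anr and anc are nr and nc rounded down to
// the AVX-512 tile multiples. When columns remain, xstart and y are reset so
// the AVX2 path below re-covers all rows for the leftover column tiles
// (round_down is a hypothetical name):

static inline int64_t round_down(int64_t n, int64_t multiple) {
    return n - (n % multiple); // e.g. nc = 100, multiple = 8 -> anc = 96
}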
+
+ // Take a group of four block_q8_0x4 structures at each pass of the loop and perform the dot product operation
+
+ for (; y < anr / 4; y += 4) {
+ const block_q8_0x4 * a_ptrs[4];
+
+ a_ptrs[0] = a_ptr_start + (y * nb);
+ for (int i = 0; i < 3; ++i) {
+ a_ptrs[i + 1] = a_ptrs[i] + nb;
+ }
+
+ // Take a group of eight block_q4_0x8 structures at each pass of the loop and perform the dot product operation
+ for (int64_t x = xstart; x < nc / 8; x++) {
+
+ const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+ // Master FP accumulators
+ __m256 acc_rows[16];
+ for (int i = 0; i < 16; i++) {
+ acc_rows[i] = _mm256_setzero_ps();
+ }
+
+ for (int64_t b = 0; b < nb; b++) {
+ // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+ const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+ const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
+ const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
+ const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
+
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
+ const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+ const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+ const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+ const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
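// Editor's note: 240 == 0xF0, so _mm256_blend_epi32 takes dwords 0-3 from
// its first operand and dwords 4-7 from its second. Assuming requiredOrder
// (set up earlier, not shown in this hunk) is the half-swapping index vector
// {4,5,6,7,0,1,2,3}, the two blends per register regroup {B0B1|B2B3} and
// {B4B5|B6B7} into {B0B1|B4B5} and {B2B3|B6B7}. A scalar sketch over the 8
// dwords (regroup_dwords is a hypothetical name):

static inline void regroup_dwords(const int32_t v0123[8], const int32_t v4567[8],
                                  int32_t v0145[8], int32_t v2367[8]) {
    for (int i = 0; i < 4; i++) {
        v0145[i]     = v0123[i];     // keep B0,B1
        v0145[i + 4] = v4567[i];     // pull in B4,B5
        v2367[i]     = v0123[i + 4]; // pull in B2,B3
        v2367[i + 4] = v4567[i + 4]; // keep B6,B7
    }
}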
2875
+
2876
+ // 4-bit -> 8-bit - Sign is maintained
2877
+ const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
2878
+ const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
2879
+
2880
+ const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
2881
+ const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
2882
+
2883
+ const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
2884
+ const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
2885
+
2886
+ const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
2887
+ const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
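// Editor's note: a scalar sketch of the nibble expansion above, assuming
// signextendlut (built earlier, not shown in this hunk) maps each 4-bit
// two's-complement value looked up by _mm256_shuffle_epi8 to its
// sign-extended 8-bit form; the low nibbles yield bytes 0-15 of a block and
// the 4-bit right shift exposes the high nibbles for bytes 16-31
// (sext4 is a hypothetical name):

static inline int8_t sext4(uint8_t q) {  // q is one masked nibble, 0..15
    return (int8_t)(q < 8 ? q : q - 16); // 4-bit two's complement -> int8
}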
2888
+
2889
+ // Shuffle pattern one - right side input
2890
+ const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
2891
+ const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
2892
+
2893
+ const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
2894
+ const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
2895
+
2896
+ const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
2897
+ const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
2898
+
2899
+ const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
2900
+ const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
2901
+
2902
+ // Shuffle pattern two - right side input
2903
+
2904
+ const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
2905
+ const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
2906
+
2907
+ const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
2908
+ const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
2909
+
2910
+ const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
2911
+ const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
2912
+
2913
+ const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
2914
+ const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
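// Editor's note: for _mm256_shuffle_epi32, imm 136 == 0b10001000 selects
// dwords {0,2,0,2} of each 128-bit lane and imm 221 == 0b11011101 selects
// {1,3,1,3}; together the two patterns visit all four dwords of a lane
// exactly once, which is why adding the sp1 and sp2 results later covers the
// whole 32-value block. A scalar sketch of pattern one on one lane
// (shuffle_imm136 is a hypothetical name):

static inline void shuffle_imm136(const int32_t v[4], int32_t out[4]) {
    out[0] = v[0]; out[1] = v[2]; out[2] = v[0]; out[3] = v[2]; // {0,2,0,2}
}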
2915
+
2916
+ // Scale values - Load the wight scale values of block_q4_0x8
2917
+ const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
2918
+
2919
+ // Process LHS in groups of four
2920
+ for (int rp = 0; rp < 4; rp++) {
2921
+ // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
2922
+ // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
2923
+ __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
2924
+ __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
2925
+ __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
2926
+ __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
2927
+ __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
2928
+ __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
2929
+ __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
2930
+ __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
2931
+ __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
2932
+ __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
2933
+ __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
2934
+ __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
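// Editor's note: _mm256_permute2f128_si256(x, x, 0) repeats the low 128 bits
// of x into both halves and imm 17 == 0x11 repeats the high 128 bits, giving
// rows A0,A1 and rows A2,A3 each a full-width register. A byte-level sketch,
// needs <string.h> (dup_halves is a hypothetical name):

static inline void dup_halves(const uint8_t x[32], uint8_t lo[32], uint8_t hi[32]) {
    memcpy(lo, x, 16);      memcpy(lo + 16, x, 16);      // imm 0x00: low half twice
    memcpy(hi, x + 16, 16); memcpy(hi + 16, x + 16, 16); // imm 0x11: high half twice
}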
2935
+
2936
+ // Shuffle pattern one - left side input
2937
+ const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
2938
+ const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
2939
+
2940
+ const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
2941
+ const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
2942
+
2943
+ const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
2944
+ const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
2945
+
2946
+ const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
2947
+ const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
2948
+
2949
+ // Shuffle pattern two - left side input
2950
+ const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
2951
+ const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
2952
+
2953
+ const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
2954
+ const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
2955
+
2956
+ const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
2957
+ const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
2958
+
2959
+ const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
2960
+ const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
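// Editor's note: the left-hand immediates mirror the right-hand ones: 160 ==
// 0b10100000 selects dwords {0,0,2,2} (each activation 4-byte group twice)
// and 245 == 0b11110101 selects {1,1,3,3}, so every lhs dword lines up twice
// against the B-pair dwords produced by patterns 136/221 above. A scalar
// sketch of imm 160 on one lane (shuffle_imm160 is a hypothetical name):

static inline void shuffle_imm160(const int32_t v[4], int32_t out[4]) {
    out[0] = v[0]; out[1] = v[0]; out[2] = v[2]; out[3] = v[2]; // {0,0,2,2}
}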
2961
+
2962
+ // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
2963
+ // Resembles MMLAs into 2x2 matrices in ARM Version
2964
+ __m256i iacc_mat_00_sp1 =
2965
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
2966
+ __m256i iacc_mat_01_sp1 =
2967
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
2968
+ __m256i iacc_mat_10_sp1 =
2969
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
2970
+ __m256i iacc_mat_11_sp1 =
2971
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
2972
+ __m256i iacc_mat_00_sp2 =
2973
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
2974
+ __m256i iacc_mat_01_sp2 =
2975
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
2976
+ __m256i iacc_mat_10_sp2 =
2977
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
2978
+ __m256i iacc_mat_11_sp2 =
2979
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
2980
+
2981
+ // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
2982
+ __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
2983
+ __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
2984
+ __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
2985
+ __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
2986
+
2987
+ // Straighten out to make 4 row vectors
2988
+ __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
2989
+ __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
2990
+ __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
2991
+ __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
2992
+
2993
+ // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
2994
+ const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
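// Editor's note: a scalar sketch of what GGML_F32Cx8_REPEAT_LOAD is assumed
// to do here (the macro is defined elsewhere in this file and not shown):
// convert the four f16 row scales of the block_q8_0x4 to f32 and repeat them
// across both 128-bit halves, so lanes i and i+4 hold the scale of row i.
// repeat_row_scales is a hypothetical name; GGML_FP16_TO_FP32 is ggml's
// existing f16 -> f32 helper:

static inline void repeat_row_scales(const ggml_fp16_t d[4], float out[8]) {
    for (int i = 0; i < 4; i++) {
        out[i] = out[i + 4] = GGML_FP16_TO_FP32(d[i]); // repeat across halves
    }
}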
2995
+
2996
+ // Multiply with appropiate scales and accumulate
2997
+ acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
2998
+ acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
2999
+ acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
3000
+ acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
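// Editor's note: with both operands equal, _mm256_shuffle_ps with imm 0, 85
// (0b01010101), 170 (0b10101010) or 255 broadcasts float element 0, 1, 2 or
// 3 of each 128-bit lane; combined with the repeat-load above, each fmadd
// therefore sees col_scale * d_row broadcast for exactly one of the four
// rows. A scalar sketch of one lane (broadcast_lane is a hypothetical name):

static inline void broadcast_lane(const float v[4], int e, float out[4]) {
    for (int i = 0; i < 4; i++) out[i] = v[e]; // imm 0/85/170/255 -> e = 0/1/2/3
}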
3001
+ }
3002
+ }
3003
+
3004
+ // Store the accumulated values
3005
+ for (int i = 0; i < 16; i++) {
3006
+ _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
3007
+ }
3008
+ }
3009
+ }
3010
+
3011
+ // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
3012
+ for (; y < nr / 4; y ++) {
3013
+
3014
+ const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
3015
+
3016
+ // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
3017
+ for (int64_t x = xstart; x < nc / 8; x++) {
3018
+
3019
+ const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
3020
+
3021
+ // Master FP accumulators
3022
+ __m256 acc_rows[4];
3023
+ for (int i = 0; i < 4; i++) {
3024
+ acc_rows[i] = _mm256_setzero_ps();
3025
+ }
3026
+
3027
+ for (int64_t b = 0; b < nb; b++) {
3028
+ // Load the eight block_q8_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
3029
+ const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
3030
+ const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
3031
+ const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
3032
+ const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
3033
+
3034
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
3035
+ const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
3036
+ const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
3037
+ const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
3038
+ const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
3039
+
3040
+ // 4-bit -> 8-bit - Sign is maintained
3041
+ const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
3042
+ const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
3043
+
3044
+ const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
3045
+ const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
3046
+
3047
+ const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
3048
+ const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
3049
+
3050
+ const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
3051
+ const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
3052
+
3053
+ // Shuffle pattern one - right side input
3054
+ const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
3055
+ const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
3056
+
3057
+ const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
3058
+ const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
3059
+
3060
+ const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
3061
+ const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
3062
+
3063
+ const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
3064
+ const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
3065
+
3066
+ // Shuffle pattern two - right side input
3067
+
3068
+ const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
3069
+ const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
3070
+
3071
+ const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
3072
+ const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
3073
+
3074
+ const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
3075
+ const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
3076
+
3077
+ const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
3078
+ const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
3079
+
3080
+ // Scale values - Load the wight scale values of block_q4_0x8
3081
+ const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
3082
+
3083
+ // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
3084
+ // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
3085
+ __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
3086
+ __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
3087
+ __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
3088
+ __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
3089
+ __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
3090
+ __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
3091
+ __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
3092
+ __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
3093
+ __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
3094
+ __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
3095
+ __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
3096
+ __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
3097
+
3098
+ // Shuffle pattern one - left side input
3099
+
3100
+ const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
3101
+ const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
3102
+
3103
+ const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
3104
+ const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
3105
+
3106
+ const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
3107
+ const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
3108
+
3109
+ const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
3110
+ const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
3111
+
3112
+ // Shuffle pattern two - left side input
3113
+
3114
+ const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
3115
+ const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
3116
+
3117
+ const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
3118
+ const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
3119
+
3120
+ const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
3121
+ const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
3122
+
3123
+ const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
3124
+ const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
3125
+
3126
+ // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
3127
+ // Resembles MMLAs into 2x2 matrices in ARM Version
3128
+ __m256i iacc_mat_00_sp1 =
3129
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
3130
+ __m256i iacc_mat_01_sp1 =
3131
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
3132
+ __m256i iacc_mat_10_sp1 =
3133
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
3134
+ __m256i iacc_mat_11_sp1 =
3135
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
3136
+ __m256i iacc_mat_00_sp2 =
3137
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
3138
+ __m256i iacc_mat_01_sp2 =
3139
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
3140
+ __m256i iacc_mat_10_sp2 =
3141
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
3142
+ __m256i iacc_mat_11_sp2 =
3143
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
3144
+
3145
+ // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
3146
+ __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
3147
+ __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
3148
+ __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
3149
+ __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
3150
+
3151
+
3152
+ // Straighten out to make 4 row vectors
3153
+ __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
3154
+ __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
3155
+ __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
3156
+ __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
3157
+
3158
+ // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
3159
+ const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
3160
+
3161
+ // Multiply with appropiate scales and accumulate
3162
+ acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
3163
+ acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
3164
+ acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
3165
+ acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
3166
+ }
3167
+
3168
+ // Store the accumulated values
3169
+ for (int i = 0; i < 4; i++) {
3170
+ _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
3171
+ }
3172
+ }
3173
+ }
3174
+ return;
2150
3175
  }
2151
- #endif
2152
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
2153
- GGML_ASSERT(ggml_cpu_has_sve() &&
2154
- "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
2155
- #elif defined(__ARM_NEON) && defined(__aarch64__)
2156
- GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
2157
- "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
2158
- "performance");
2159
- #else
3176
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
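// Editor's note: below this #endif the function falls back to plain C. As a
// minimal sketch of the per-block math that fallback has to reproduce,
// assuming the usual Q4_0 layout (16 bytes of packed nibbles with an
// implicit -8 offset and one f16 scale d4 per 32 values) against a Q8_0
// block (32 int8 values, scale d8); the real loop additionally walks the
// interleaved 4-row-by-8-column layout (q4_0_q8_0_block_dot is a
// hypothetical name):

static inline float q4_0_q8_0_block_dot(const uint8_t q4[16], const int8_t q8[32],
                                        float d4, float d8) {
    int32_t sumi = 0;
    for (int j = 0; j < 16; j++) {
        sumi += ((q4[j] & 0x0F) - 8) * q8[j];      // low nibble -> value - 8
        sumi += ((q4[j] >> 4)   - 8) * q8[j + 16]; // high nibble
    }
    return d4 * d8 * (float) sumi;
}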
  float sumf[4][8];
  int sumi;
 
@@ -2189,5 +3206,4 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
  }
  }
  }
- #endif
  }