@fugood/llama.node 1.0.0-beta.4 → 1.0.0-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/CMakeLists.txt +7 -4
  2. package/lib/binding.ts +1 -1
  3. package/package.json +14 -14
  4. package/scripts/llama.cpp.patch +27 -26
  5. package/src/LlamaCompletionWorker.cpp +21 -4
  6. package/src/LlamaCompletionWorker.h +2 -0
  7. package/src/LlamaContext.cpp +3 -12
  8. package/src/common.hpp +6 -5
  9. package/src/llama.cpp/CMakeLists.txt +15 -4
  10. package/src/llama.cpp/common/CMakeLists.txt +15 -24
  11. package/src/llama.cpp/common/arg.cpp +172 -110
  12. package/src/llama.cpp/common/chat-parser.cpp +385 -0
  13. package/src/llama.cpp/common/chat-parser.h +120 -0
  14. package/src/llama.cpp/common/chat.cpp +726 -596
  15. package/src/llama.cpp/common/chat.h +74 -8
  16. package/src/llama.cpp/common/common.cpp +56 -38
  17. package/src/llama.cpp/common/common.h +9 -3
  18. package/src/llama.cpp/common/json-partial.cpp +256 -0
  19. package/src/llama.cpp/common/json-partial.h +38 -0
  20. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  21. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
  22. package/src/llama.cpp/common/sampling.cpp +7 -8
  23. package/src/llama.cpp/common/speculative.cpp +6 -4
  24. package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
  25. package/src/llama.cpp/ggml/include/ggml.h +22 -3
  26. package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
  27. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
  28. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  29. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  30. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  43. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  47. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  48. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  49. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  50. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
  52. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  53. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  54. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  55. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
  56. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  57. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
  58. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  60. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
  61. package/src/llama.cpp/include/llama.h +145 -40
  62. package/src/llama.cpp/src/CMakeLists.txt +5 -1
  63. package/src/llama.cpp/src/llama-arch.cpp +99 -3
  64. package/src/llama.cpp/src/llama-arch.h +10 -1
  65. package/src/llama.cpp/src/llama-batch.cpp +728 -272
  66. package/src/llama.cpp/src/llama-batch.h +112 -54
  67. package/src/llama.cpp/src/llama-chat.cpp +19 -2
  68. package/src/llama.cpp/src/llama-chat.h +1 -0
  69. package/src/llama.cpp/src/llama-context.cpp +525 -339
  70. package/src/llama.cpp/src/llama-context.h +38 -17
  71. package/src/llama.cpp/src/llama-cparams.cpp +4 -0
  72. package/src/llama.cpp/src/llama-cparams.h +2 -0
  73. package/src/llama.cpp/src/llama-grammar.cpp +12 -2
  74. package/src/llama.cpp/src/llama-graph.cpp +413 -353
  75. package/src/llama.cpp/src/llama-graph.h +112 -56
  76. package/src/llama.cpp/src/llama-hparams.cpp +10 -2
  77. package/src/llama.cpp/src/llama-hparams.h +13 -2
  78. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
  79. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
  80. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
  81. package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
  82. package/src/llama.cpp/src/llama-kv-cells.h +415 -0
  83. package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  84. package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
  85. package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
  86. package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
  87. package/src/llama.cpp/src/llama-memory.cpp +41 -0
  88. package/src/llama.cpp/src/llama-memory.h +86 -5
  89. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  90. package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
  91. package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
  92. package/src/llama.cpp/src/llama-model.cpp +1137 -528
  93. package/src/llama.cpp/src/llama-model.h +4 -0
  94. package/src/llama.cpp/src/llama-quant.cpp +2 -1
  95. package/src/llama.cpp/src/llama-sampling.cpp +2 -2
  96. package/src/llama.cpp/src/llama-vocab.cpp +69 -32
  97. package/src/llama.cpp/src/llama-vocab.h +1 -0
  98. package/src/llama.cpp/src/llama.cpp +11 -7
  99. package/src/llama.cpp/src/unicode.cpp +5 -0
  100. package/src/tts_utils.h +1 -1
  101. package/src/llama.cpp/common/json.hpp +0 -24766
  102. package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
  103. package/src/llama.cpp/common/minja/minja.hpp +0 -2974
  104. package/src/llama.cpp/common/stb_image.h +0 -7988
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  106. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
  107. package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
  108. package/src/llama.cpp/src/llama-kv-cache.h +0 -515
  109. /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  110. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  111. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -0,0 +1,4310 @@
1
+ #define GGML_COMMON_IMPL_C
2
+ #include "ggml-common.h"
3
+ #include "ggml-quants.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml-cpu.h"
6
+
7
+ #include "../../quants.h"
8
+ #include "../../ggml-cpu-impl.h"
9
+
10
+ #include <math.h>
11
+ #include <string.h>
12
+ #include <assert.h>
13
+ #include <stdlib.h> // for qsort
14
+ #include <stdio.h> // for GGML_ASSERT
15
+
16
+ #define GROUP_MAX_EPS 1e-15f
17
+ #define GROUP_MAX_EPS_IQ3_XXS 1e-8f
18
+ #define GROUP_MAX_EPS_IQ2_S 1e-8f
19
+ #define GROUP_MAX_EPS_IQ1_M 1e-7f
20
+ #define GROUP_MAX_EPS_IQ1_S 1e-12f
21
+
22
+ #define UNUSED GGML_UNUSED
23
+
24
+ // some compilers don't provide _mm256_set_m128i, e.g. gcc 7; this stand-in puts `a` in the upper 128 bits and `b` in the lower 128 bits
25
+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
26
+
27
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
28
+ // multiply int8_t lanes of x and y, add results pairwise twice: each int32 output lane is the sum of 4 adjacent byte products
29
+ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
30
+ // Get absolute values of x vectors (_mm_maddubs_epi16 reads its first operand as unsigned bytes)
31
+ const __m128i ax = _mm_sign_epi8(x, x);
32
+ // Sign the values of the y vectors: y is negated where x < 0 and zeroed where x == 0, so ax * sy equals x * y per lane
33
+ const __m128i sy = _mm_sign_epi8(y, x);
34
+ // Perform multiplication and create 16-bit values (adjacent products are already summed in pairs)
35
+ const __m128i dot = _mm_maddubs_epi16(ax, sy);
36
+ const __m128i ones = _mm_set1_epi16(1);
37
+ return _mm_madd_epi16(ones, dot); // multiply by 1 and add adjacent int16 pairs -> 4 x int32
38
+ }
39
+
40
+ #if __AVX__ || __AVX2__ || __AVX512F__
41
+ // horizontally add 8 floats: returns the sum of all 8 lanes of x as a scalar
42
+ static inline float hsum_float_8(const __m256 x) {
43
+ __m128 res = _mm256_extractf128_ps(x, 1); // upper 4 lanes
44
+ res = _mm_add_ps(res, _mm256_castps256_ps128(x)); // + lower 4 lanes -> 4 partial sums
45
+ res = _mm_add_ps(res, _mm_movehl_ps(res, res)); // fold lanes 2,3 onto lanes 0,1
46
+ res = _mm_add_ss(res, _mm_movehdup_ps(res)); // lane 0 += lane 1
47
+ return _mm_cvtss_f32(res); // total is in lane 0
48
+ }
49
+
50
+ // horizontally add 8 int32_t: returns the sum of all 8 lanes of a
51
+ static inline int hsum_i32_8(const __m256i a) {
52
+ const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); // lower half + upper half
53
+ const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); // lanes 2,3 copied down to lanes 0,1
54
+ const __m128i sum64 = _mm_add_epi32(hi64, sum128); // lanes 0,1 now hold two partial sums
55
+ const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent lanes so lane 0 sees lane 1
56
+ return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); // lane 0 holds the total
57
+ }
58
+
59
+ // horizontally add 4 int32_t: returns the sum of all 4 lanes of a
60
+ static inline int hsum_i32_4(const __m128i a) {
61
+ const __m128i hi64 = _mm_unpackhi_epi64(a, a); // lanes 2,3 copied down to lanes 0,1
62
+ const __m128i sum64 = _mm_add_epi32(hi64, a); // lanes 0,1 now hold two partial sums
63
+ const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent lanes so lane 0 sees lane 1
64
+ return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); // lane 0 holds the total
65
+ }
66
+
67
+ #if defined(__AVX2__) || defined(__AVX512F__)
68
+ // spread 32 bits to 32 bytes { 0x00, 0xFF }: output byte 8*k + j is 0xFF when bit j of input byte k is set, else 0x00
69
+ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
70
+ uint32_t x32;
71
+ memcpy(&x32, x, sizeof(uint32_t)); // reads exactly 4 bytes from x without requiring alignment
72
+ const __m256i shuf_mask = _mm256_set_epi64x( // each 8-byte group selects one source byte (0, 1, 2, 3 from low to high)
73
+ 0x0303030303030303, 0x0202020202020202,
74
+ 0x0101010101010101, 0x0000000000000000);
75
+ __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); // source byte k replicated into output bytes 8*k .. 8*k + 7
76
+ const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); // byte j of every group is ~(1 << j)
77
+ bytes = _mm256_or_si256(bytes, bit_mask); // a byte becomes 0xFF only if its own bit j was set
78
+ return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); // 0xFF where the bit was set, 0x00 elsewhere
79
+ }
80
+
81
+ // Unpack 32 4-bit fields into 32 bytes (reads 16 packed bytes from rsi)
82
+ // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval; low nibbles fill bytes 0..15, high nibbles fill bytes 16..31
83
+ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
84
+ {
85
+ const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); // unaligned 16-byte load
86
+ const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); // upper half: input shifted right by 4, lower half: input as loaded
87
+ const __m256i lowMask = _mm256_set1_epi8( 0xF );
88
+ return _mm256_and_si256(lowMask, bytes); // keep only the low nibble of every byte
89
+ }
90
+
91
+ // add int16_t pairwise and return as float vector (16 int16 lanes in, 8 float lanes out)
92
+ static inline __m256 sum_i16_pairs_float(const __m256i x) {
93
+ const __m256i ones = _mm256_set1_epi16(1);
94
+ const __m256i summed_pairs = _mm256_madd_epi16(ones, x); // multiply by 1 and add adjacent int16 pairs -> int32
95
+ return _mm256_cvtepi32_ps(summed_pairs);
96
+ }
97
+
98
+ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { // multiply unsigned bytes ax by signed bytes sy, sum each group of 4 adjacent products, return the 8 sums as floats
99
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
100
+ const __m256i zero = _mm256_setzero_si256();
101
+ const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); // accumulates the 4-byte dot products onto a zero accumulator
102
+ return _mm256_cvtepi32_ps(summed_pairs);
103
+ #elif defined(__AVXVNNI__)
104
+ const __m256i zero = _mm256_setzero_si256();
105
+ const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy); // same operation through the AVX-VNNI encoding
106
+ return _mm256_cvtepi32_ps(summed_pairs);
107
+ #else
108
+ // Perform multiplication and create 16-bit values (NOTE(review): _mm256_maddubs_epi16 saturates its int16 pair sums, unlike the dpbusd paths)
109
+ const __m256i dot = _mm256_maddubs_epi16(ax, sy);
110
+ return sum_i16_pairs_float(dot);
111
+ #endif
112
+ }
113
+
114
+ // multiply int8_t, add results pairwise twice and return as float vector (8 floats, each the sum of 4 adjacent x*y byte products)
115
+ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
116
+ #if __AVXVNNIINT8__
117
+ const __m256i zero = _mm256_setzero_si256();
118
+ const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); // signed x signed byte dot product, no sign trick needed
119
+ return _mm256_cvtepi32_ps(summed_pairs);
120
+ #else
121
+ // Get absolute values of x vectors (the unsigned operand expected by mul_sum_us8_pairs_float)
122
+ const __m256i ax = _mm256_sign_epi8(x, x);
123
+ // Sign the values of the y vectors: moves the sign of x onto y so that ax * sy equals x * y per lane
124
+ const __m256i sy = _mm256_sign_epi8(y, x);
125
+ return mul_sum_us8_pairs_float(ax, sy);
126
+ #endif
127
+ }
128
+
129
+ static inline __m128i packNibbles( __m256i bytes ) // pack 32 nibbles, stored one per byte (two per 16-bit lane), into 16 bytes
130
+ {
131
+ // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
132
+ #if defined(__AVX512BW__) && defined(__AVX512VL__) // _mm256_cvtepi16_epi8 needs AVX512BW + AVX512VL; AVX512F alone is not enough
133
+ const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
134
+ bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
135
+ return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
136
+ #else
137
+ const __m256i lowByte = _mm256_set1_epi16( 0xFF );
138
+ __m256i high = _mm256_andnot_si256( lowByte, bytes ); // high byte of each 16-bit lane
139
+ __m256i low = _mm256_and_si256( lowByte, bytes ); // low byte of each 16-bit lane
140
+ high = _mm256_srli_epi16( high, 4 ); // move the high nibble next to the low one
141
+ bytes = _mm256_or_si256( low, high );
142
+
143
+ // Compress uint16_t lanes into bytes
144
+ __m128i r0 = _mm256_castsi256_si128( bytes );
145
+ __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
146
+ return _mm_packus_epi16( r0, r1 ); // every lane is <= 0xFF here, so the unsigned saturation never clips
147
+ #endif
148
+ }
149
+ #elif defined(__AVX__)
150
+ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) // AVX-only variant: the 32 nibbles arrive as two 128-bit halves
151
+ {
152
+ // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
153
+ const __m128i lowByte = _mm_set1_epi16( 0xFF );
154
+ __m128i high = _mm_andnot_si128( lowByte, bytes1 ); // high byte of each 16-bit lane
155
+ __m128i low = _mm_and_si128( lowByte, bytes1 ); // low byte of each 16-bit lane
156
+ high = _mm_srli_epi16( high, 4 );
157
+ bytes1 = _mm_or_si128( low, high );
158
+ high = _mm_andnot_si128( lowByte, bytes2 ); // same steps for the second half
159
+ low = _mm_and_si128( lowByte, bytes2 );
160
+ high = _mm_srli_epi16( high, 4 );
161
+ bytes2 = _mm_or_si128( low, high );
162
+
163
+ return _mm_packus_epi16( bytes1, bytes2); // bytes1 lanes fill output bytes 0..7, bytes2 lanes fill bytes 8..15
164
+ }
165
+
166
+ static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { // signed int8 multiply of x and y with adjacent products summed into 8 int16 lanes
167
+ const __m128i ax = _mm_sign_epi8(x, x); // |x|, the unsigned operand for _mm_maddubs_epi16
168
+ const __m128i sy = _mm_sign_epi8(y, x); // y carrying the sign of x, so ax * sy equals x * y
169
+ return _mm_maddubs_epi16(ax, sy);
170
+ }
171
+
172
+ // spread 32 bits to 32 bytes { 0x00, 0xFF }: AVX-only variant that builds the result from two 128-bit halves
173
+ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
174
+ uint32_t x32;
175
+ memcpy(&x32, x, sizeof(uint32_t)); // reads exactly 4 bytes from x without requiring alignment
176
+ const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); // low half: source bytes 0 and 1
177
+ const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); // high half: source bytes 2 and 3
178
+ __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
179
+ __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
180
+ const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); // byte j of every 8-byte group is ~(1 << j)
181
+ bytesl = _mm_or_si128(bytesl, bit_mask); // a byte becomes 0xFF only if its own bit j was set
182
+ bytesh = _mm_or_si128(bytesh, bit_mask);
183
+ bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); // 0xFF where the bit was set, 0x00 elsewhere
184
+ bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
185
+ return MM256_SET_M128I(bytesh, bytesl);
186
+ }
187
+
188
+ // Unpack 32 4-bit fields into 32 bytes (AVX-only variant working on 128-bit halves)
189
+ // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval; low nibbles fill bytes 0..15, high nibbles fill bytes 16..31
190
+ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
191
+ {
192
+ // Load 16 bytes from memory (unaligned load)
193
+ __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
194
+ __m128i tmph = _mm_srli_epi16(tmpl, 4); // high nibbles shifted down into the low nibble position
195
+ const __m128i lowMask = _mm_set1_epi8(0xF);
196
+ tmpl = _mm_and_si128(lowMask, tmpl);
197
+ tmph = _mm_and_si128(lowMask, tmph);
198
+ return MM256_SET_M128I(tmph, tmpl);
199
+ }
200
+
201
+ // add int16_t pairwise and return as float vector; xl supplies the lower 4 output lanes and xh the upper 4
202
+ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
203
+ const __m128i ones = _mm_set1_epi16(1);
204
+ const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); // multiply by 1 and add adjacent int16 pairs -> int32
205
+ const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
206
+ const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
207
+ return _mm256_cvtepi32_ps(summed_pairs);
208
+ }
209
+
210
+ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { // AVX-only variant: unsigned ax times signed sy, 4 adjacent products summed per float lane
211
+ const __m128i axl = _mm256_castsi256_si128(ax); // split both operands into 128-bit halves
212
+ const __m128i axh = _mm256_extractf128_si256(ax, 1);
213
+ const __m128i syl = _mm256_castsi256_si128(sy);
214
+ const __m128i syh = _mm256_extractf128_si256(sy, 1);
215
+ // Perform multiplication and create 16-bit values
216
+ const __m128i dotl = _mm_maddubs_epi16(axl, syl);
217
+ const __m128i doth = _mm_maddubs_epi16(axh, syh);
218
+ return sum_i16_pairs_float(doth, dotl);
219
+ }
220
+
221
+ // multiply int8_t, add results pairwise twice and return as float vector (AVX-only variant working on 128-bit halves)
222
+ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
223
+ const __m128i xl = _mm256_castsi256_si128(x); // split both operands into 128-bit halves
224
+ const __m128i xh = _mm256_extractf128_si256(x, 1);
225
+ const __m128i yl = _mm256_castsi256_si128(y);
226
+ const __m128i yh = _mm256_extractf128_si256(y, 1);
227
+ // Get absolute values of x vectors (the unsigned operand for _mm_maddubs_epi16)
228
+ const __m128i axl = _mm_sign_epi8(xl, xl);
229
+ const __m128i axh = _mm_sign_epi8(xh, xh);
230
+ // Sign the values of the y vectors: moves the sign of x onto y so that ax * sy equals x * y per lane
231
+ const __m128i syl = _mm_sign_epi8(yl, xl);
232
+ const __m128i syh = _mm_sign_epi8(yh, xh);
233
+ // Perform multiplication and create 16-bit values
234
+ const __m128i dotl = _mm_maddubs_epi16(axl, syl);
235
+ const __m128i doth = _mm_maddubs_epi16(axh, syh);
236
+ return sum_i16_pairs_float(doth, dotl);
237
+ }
238
+
239
+ // larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors; the x_1/y_1 sums land in the lower 4 float lanes, the x_2/y_2 sums in the upper 4
240
+ static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
241
+ const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
242
+ const __m128i mone = _mm_set1_epi16(1);
243
+
244
+ const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0); // int16 sums of adjacent int8 products
245
+ const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
246
+ const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
247
+ const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
248
+ const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); // widen to int32 while adding adjacent int16 pairs
249
+ const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
250
+ const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
251
+ const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
252
+ const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1); // combine the two halves of the first operand pair
253
+ const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1); // combine the two halves of the second operand pair
254
+ return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
255
+ }
256
+
257
+ // quad fp16 delta calculation: lower 4 lanes = fp32(x0) * fp32(y0), upper 4 lanes = fp32(x1) * fp32(y1); inputs are raw fp16 values, hence ggml_fp16_t rather than float
258
+ static inline __m256 quad_fp16_delta_float(const ggml_fp16_t x0, const ggml_fp16_t y0, const ggml_fp16_t x1, const ggml_fp16_t y1) {
259
+ // GGML_FP16_TO_FP32 is faster than Intel F16C
260
+ return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
261
+ _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
262
+ }
263
+ #endif
264
+ #elif defined(__SSSE3__)
265
+ // horizontally add 4x4 floats: returns the sum of all 16 lanes of a, b, c and d
266
+ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
267
+ __m128 res_0 =_mm_hadd_ps(a, b); // { a0+a1, a2+a3, b0+b1, b2+b3 }
268
+ __m128 res_1 =_mm_hadd_ps(c, d); // { c0+c1, c2+c3, d0+d1, d2+d3 }
269
+ __m128 res =_mm_hadd_ps(res_0, res_1); // { sum(a), sum(b), sum(c), sum(d) }
270
+ res =_mm_hadd_ps(res, res); // { sum(a)+sum(b), sum(c)+sum(d), ... }
271
+ res =_mm_hadd_ps(res, res); // lane 0 now holds the grand total
272
+
273
+ return _mm_cvtss_f32(res);
274
+ }
275
+ #endif // __AVX__ || __AVX2__ || __AVX512F__
276
+ #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
277
+
278
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { // quantize k floats from x into k / QK8_0 block_q8_0 entries at vy: per block one fp16 scale d and 32 int8 quants qs
279
+ assert(QK8_0 == 32); // the SIMD loop below consumes exactly 32 floats per block
280
+ assert(k % QK8_0 == 0);
281
+ const int nb = k / QK8_0; // number of blocks; NOTE(review): the int64_t quotient is narrowed to int here
282
+
283
+ block_q8_0 * GGML_RESTRICT y = vy;
284
+
285
+ #if defined(__AVX2__) || defined(__AVX__)
286
+ for (int i = 0; i < nb; i++) {
287
+ // Load elements into 4 AVX vectors (unaligned loads, 8 floats each)
288
+ __m256 v0 = _mm256_loadu_ps( x );
289
+ __m256 v1 = _mm256_loadu_ps( x + 8 );
290
+ __m256 v2 = _mm256_loadu_ps( x + 16 );
291
+ __m256 v3 = _mm256_loadu_ps( x + 24 );
292
+ x += 32;
293
+
294
+ // Compute max(abs(e)) for the block
295
+ const __m256 signBit = _mm256_set1_ps( -0.0f ); // only the sign bit is set in -0.0f
296
+ __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); // andnot clears the sign bit, i.e. takes the absolute value
297
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
298
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
299
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
300
+
301
+ __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); // horizontal max: 8 lanes -> 4
302
+ max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); // 4 lanes -> 2
303
+ max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); // 2 lanes -> lane 0
304
+ const float maxScalar = _mm_cvtss_f32( max4 );
305
+
306
+ // Quantize these floats
307
+ const float d = maxScalar / 127.f; // scale stored with the block
308
+ y[i].d = GGML_FP32_TO_FP16(d);
309
+ const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; // inverse scale; an all-zero block quantizes to zeros instead of dividing by 0
310
+ const __m256 mul = _mm256_set1_ps( id );
311
+
312
+ // Apply the multiplier
313
+ v0 = _mm256_mul_ps( v0, mul );
314
+ v1 = _mm256_mul_ps( v1, mul );
315
+ v2 = _mm256_mul_ps( v2, mul );
316
+ v3 = _mm256_mul_ps( v3, mul );
317
+
318
+ // Round to nearest integer
319
+ v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); // NOTE(review): _MM_ROUND_NEAREST is the MXCSR rounding-control constant; its value 0 equals _MM_FROUND_TO_NEAREST_INT, the constant _mm256_round_ps documents
320
+ v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
321
+ v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
322
+ v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
323
+
324
+ // Convert floats to integers
325
+ __m256i i0 = _mm256_cvtps_epi32( v0 );
326
+ __m256i i1 = _mm256_cvtps_epi32( v1 );
327
+ __m256i i2 = _mm256_cvtps_epi32( v2 );
328
+ __m256i i3 = _mm256_cvtps_epi32( v3 );
329
+
330
+ #if defined(__AVX2__)
331
+ // Convert int32 to int16
332
+ i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
333
+ i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
334
+ // Convert int16 to int8
335
+ i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
336
+
337
+ // We got our precious signed bytes, but the order is now wrong
338
+ // These AVX2 pack instructions process 16-byte pieces independently
339
+ // The following instruction is fixing the order
340
+ const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
341
+ i0 = _mm256_permutevar8x32_epi32( i0, perm );
342
+
343
+ _mm256_storeu_si256((__m256i *)y[i].qs, i0); // writes all 32 quants of the block
344
+ #else
345
+ // AVX without AVX2 lacks the 256-bit integer pack instructions used above,
346
+ // so the registers are split in half and the 128-bit SSE equivalents are used instead
347
+ __m128i ni0 = _mm256_castsi256_si128( i0 );
348
+ __m128i ni1 = _mm256_extractf128_si256( i0, 1);
349
+ __m128i ni2 = _mm256_castsi256_si128( i1 );
350
+ __m128i ni3 = _mm256_extractf128_si256( i1, 1);
351
+ __m128i ni4 = _mm256_castsi256_si128( i2 );
352
+ __m128i ni5 = _mm256_extractf128_si256( i2, 1);
353
+ __m128i ni6 = _mm256_castsi256_si128( i3 );
354
+ __m128i ni7 = _mm256_extractf128_si256( i3, 1);
355
+
356
+ // Convert int32 to int16 (128-bit packs keep the elements in order, so no permute is needed on this path)
357
+ ni0 = _mm_packs_epi32( ni0, ni1 );
358
+ ni2 = _mm_packs_epi32( ni2, ni3 );
359
+ ni4 = _mm_packs_epi32( ni4, ni5 );
360
+ ni6 = _mm_packs_epi32( ni6, ni7 );
361
+ // Convert int16 to int8
362
+ ni0 = _mm_packs_epi16( ni0, ni2 );
363
+ ni4 = _mm_packs_epi16( ni4, ni6 );
364
+
365
+ _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); // quants 0..15
366
+ _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); // quants 16..31
367
+ #endif
368
+ }
369
+ #else
370
+ GGML_UNUSED(nb);
371
+ // scalar: no AVX available, defer to the reference implementation
372
+ quantize_row_q8_0_ref(x, y, k);
373
+ #endif
374
+ }
375
+
376
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { // quantize k floats from x into k / QK8_1 block_q8_1 entries at vy: per block an fp16 scale d, an fp16 s = d * sum(quants), and 32 int8 quants qs
377
+ assert(k % QK8_1 == 0);
378
+ const int nb = k / QK8_1; // number of blocks; NOTE(review): the int64_t quotient is narrowed to int here
379
+
380
+ block_q8_1 * GGML_RESTRICT y = vy;
381
+ #if defined(__AVX2__) || defined(__AVX__)
382
+ for (int i = 0; i < nb; i++) { // NOTE(review): each iteration consumes 32 floats, so this assumes QK8_1 == 32; unlike quantize_row_q8_0 that is not asserted here
383
+ // Load elements into 4 AVX vectors (unaligned loads, 8 floats each)
384
+ __m256 v0 = _mm256_loadu_ps( x );
385
+ __m256 v1 = _mm256_loadu_ps( x + 8 );
386
+ __m256 v2 = _mm256_loadu_ps( x + 16 );
387
+ __m256 v3 = _mm256_loadu_ps( x + 24 );
388
+ x += 32;
389
+
390
+ // Compute max(abs(e)) for the block
391
+ const __m256 signBit = _mm256_set1_ps( -0.0f ); // only the sign bit is set in -0.0f
392
+ __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); // andnot clears the sign bit, i.e. takes the absolute value
393
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
394
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
395
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
396
+
397
+ __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); // horizontal max: 8 lanes -> 4
398
+ max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); // 4 lanes -> 2
399
+ max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); // 2 lanes -> lane 0
400
+ const float max_scalar = _mm_cvtss_f32( max4 );
401
+
402
+ // Quantize these floats
403
+ const float d = max_scalar / 127.f; // scale stored with the block
404
+ y[i].d = GGML_FP32_TO_FP16(d);
405
+ const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; // inverse scale; an all-zero block quantizes to zeros instead of dividing by 0
406
+ const __m256 mul = _mm256_set1_ps( id );
407
+
408
+ // Apply the multiplier
409
+ v0 = _mm256_mul_ps( v0, mul );
410
+ v1 = _mm256_mul_ps( v1, mul );
411
+ v2 = _mm256_mul_ps( v2, mul );
412
+ v3 = _mm256_mul_ps( v3, mul );
413
+
414
+ // Round to nearest integer
415
+ v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); // NOTE(review): _MM_ROUND_NEAREST is the MXCSR rounding-control constant; its value 0 equals _MM_FROUND_TO_NEAREST_INT, the constant _mm256_round_ps documents
416
+ v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
417
+ v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
418
+ v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
419
+
420
+ // Convert floats to integers
421
+ __m256i i0 = _mm256_cvtps_epi32( v0 );
422
+ __m256i i1 = _mm256_cvtps_epi32( v1 );
423
+ __m256i i2 = _mm256_cvtps_epi32( v2 );
424
+ __m256i i3 = _mm256_cvtps_epi32( v3 );
425
+
426
+ #if defined(__AVX2__)
427
+ // Compute the sum of the quants and set y[i].s (must happen before the packs below overwrite i0 and i2)
428
+ y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
429
+
430
+ // Convert int32 to int16
431
+ i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
432
+ i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
433
+ // Convert int16 to int8
434
+ i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
435
+
436
+ // We got our precious signed bytes, but the order is now wrong
437
+ // These AVX2 pack instructions process 16-byte pieces independently
438
+ // The following instruction is fixing the order
439
+ const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
440
+ i0 = _mm256_permutevar8x32_epi32( i0, perm );
441
+
442
+ _mm256_storeu_si256((__m256i *)y[i].qs, i0); // writes all 32 quants of the block
443
+ #else
444
+ // AVX without AVX2 lacks the 256-bit integer pack instructions used above,
445
+ // so the registers are split in half and the 128-bit SSE equivalents are used instead
446
+ __m128i ni0 = _mm256_castsi256_si128( i0 );
447
+ __m128i ni1 = _mm256_extractf128_si256( i0, 1);
448
+ __m128i ni2 = _mm256_castsi256_si128( i1 );
449
+ __m128i ni3 = _mm256_extractf128_si256( i1, 1);
450
+ __m128i ni4 = _mm256_castsi256_si128( i2 );
451
+ __m128i ni5 = _mm256_extractf128_si256( i2, 1);
452
+ __m128i ni6 = _mm256_castsi256_si128( i3 );
453
+ __m128i ni7 = _mm256_extractf128_si256( i3, 1);
454
+
455
+ // Compute the sum of the quants and set y[i].s (must happen before the packs below overwrite the int32 lanes)
456
+ const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
457
+ const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
458
+ y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
459
+
460
+ // Convert int32 to int16 (128-bit packs keep the elements in order, so no permute is needed on this path)
461
+ ni0 = _mm_packs_epi32( ni0, ni1 );
462
+ ni2 = _mm_packs_epi32( ni2, ni3 );
463
+ ni4 = _mm_packs_epi32( ni4, ni5 );
464
+ ni6 = _mm_packs_epi32( ni6, ni7 );
465
+ // Convert int16 to int8
466
+ ni0 = _mm_packs_epi16( ni0, ni2 );
467
+ ni4 = _mm_packs_epi16( ni4, ni6 );
468
+
469
+ _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); // quants 0..15
470
+ _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); // quants 16..31
471
+ #endif
472
+ }
473
+ #else
474
+ GGML_UNUSED(nb);
475
+ // scalar: no AVX available, defer to the reference implementation
476
+ quantize_row_q8_1_ref(x, y, k);
477
+ #endif
478
+ }
479
+
480
+ // placeholder implementation for Apple targets (NOTE(review): this comment sits in the x86 file and looks inherited - confirm); no SIMD path here, the call is forwarded unchanged to the reference routine
481
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
482
+ quantize_row_q8_K_ref(x, y, k);
483
+ }
484
+
485
+ //===================================== Dot products =================================
486
+
487
+ //
488
+ // Helper functions
489
+ //
490
+
491
+ #if __AVX__ || __AVX2__ || __AVX512F__
492
+
493
+ // shuffles to pick the required scales in dot products: returns 32-byte row i of the table below (4 rows, so i must be in 0..3; not range-checked)
494
+ static inline __m256i get_scale_shuffle_q3k(int i) {
495
+ static const uint8_t k_shuffle[128] = { // row r: byte pair (4r, 4r+1) repeated over the first 16 bytes, pair (4r+2, 4r+3) over the last 16
496
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
497
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
498
+ 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
499
+ 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
500
+ };
501
+ return _mm256_loadu_si256((const __m256i*)k_shuffle + i); // the cast applies before the addition, so i steps in 32-byte units
502
+ }
503
+ static inline __m256i get_scale_shuffle_k4(int i) { // returns 32-byte row i of the shuffle table below (8 rows, so i must be in 0..7; not range-checked)
504
+ static const uint8_t k_shuffle[256] = { // row r: byte pair (2r, 2r+1) repeated across all 32 bytes
505
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
506
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
507
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
508
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
509
+ 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
510
+ 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
511
+ 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
512
+ 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
513
+ };
514
+ return _mm256_loadu_si256((const __m256i*)k_shuffle + i); // the cast applies before the addition, so i steps in 32-byte units
515
+ }
516
+ static inline __m128i get_scale_shuffle(int i) { // returns 16-byte row i of the shuffle table below (8 rows, so i must be in 0..7; not range-checked)
517
+ static const uint8_t k_shuffle[128] = { // row r: index 2r repeated 8 times, then index 2r+1 repeated 8 times
518
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
519
+ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
520
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
521
+ 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
522
+ 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
523
+ 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
524
+ 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
525
+ 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
526
+ };
527
+ return _mm_loadu_si128((const __m128i*)k_shuffle + i); // the cast applies before the addition, so i steps in 16-byte units
528
+ }
529
+ #endif
530
+
531
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
532
+ const int qk = QK8_0;
533
+ const int nb = n / qk;
534
+
535
+ assert(n % qk == 0);
536
+ assert(nrc == 1);
537
+ UNUSED(nrc);
538
+ UNUSED(bx);
539
+ UNUSED(by);
540
+ UNUSED(bs);
541
+
542
+ const block_q4_0 * GGML_RESTRICT x = vx;
543
+ const block_q8_0 * GGML_RESTRICT y = vy;
544
+
545
+ int ib = 0;
546
+ float sumf = 0;
547
+
548
+ #if defined(__AVX2__)
549
+ // Initialize accumulator with zeros
550
+ __m256 acc = _mm256_setzero_ps();
551
+
552
+ // Main loop
553
+ for (; ib < nb; ++ib) {
554
+ /* Compute combined scale for the block */
555
+ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
556
+
557
+ __m256i qx = bytes_from_nibbles_32(x[ib].qs);
558
+
559
+ // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
560
+ const __m256i off = _mm256_set1_epi8( 8 );
561
+ qx = _mm256_sub_epi8( qx, off );
562
+
563
+ __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
564
+
565
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);
566
+
567
+ /* Multiply q with scale and accumulate */
568
+ acc = _mm256_fmadd_ps( d, q, acc );
569
+ }
570
+
571
+ sumf = hsum_float_8(acc);
572
+ #elif defined(__AVX__)
573
+ __m256 accum = _mm256_setzero_ps();
574
+ for (; ib + 1 < nb; ib += 2) {
575
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
576
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
577
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
578
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
579
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
580
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
581
+
582
+ const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
583
+ const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
584
+ const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
585
+ const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
586
+
587
+ const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
588
+ const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
589
+ const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
590
+ const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
591
+ const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
592
+ const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
593
+ const __m256 p = sum_i16_pairs_float(p_2, p_1);
594
+
595
+ const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
596
+ accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
597
+ }
598
+
599
+ sumf = hsum_float_8(accum);
600
+ #elif defined(__SSSE3__)
601
+ // set constants
602
+ const __m128i lowMask = _mm_set1_epi8(0xF);
603
+ const __m128i off = _mm_set1_epi8(8);
604
+
605
+ // Initialize accumulator with zeros
606
+ __m128 acc_0 = _mm_setzero_ps();
607
+ __m128 acc_1 = _mm_setzero_ps();
608
+ __m128 acc_2 = _mm_setzero_ps();
609
+ __m128 acc_3 = _mm_setzero_ps();
610
+
611
+ for (; ib + 1 < nb; ib += 2) {
612
+ _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0);
613
+ _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
614
+
615
+ // Compute combined scale for the block 0 and 1
616
+ const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
617
+
618
+ const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
619
+
620
+ __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
621
+ __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
622
+ bx_0 = _mm_sub_epi8(bx_0, off);
623
+ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
624
+
625
+ __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
626
+ __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
627
+ bx_1 = _mm_sub_epi8(bx_1, off);
628
+ const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
629
+
630
+ _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
631
+ _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
632
+
633
+ // Compute combined scale for the block 2 and 3
634
+ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
635
+
636
+ const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
637
+
638
+ __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
639
+ __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
640
+ bx_2 = _mm_sub_epi8(bx_2, off);
641
+ const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
642
+
643
+ __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
644
+ __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16));
645
+ bx_3 = _mm_sub_epi8(bx_3, off);
646
+ const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
647
+
648
+ // Convert int32_t to float
649
+ __m128 p0 = _mm_cvtepi32_ps(i32_0);
650
+ __m128 p1 = _mm_cvtepi32_ps(i32_1);
651
+ __m128 p2 = _mm_cvtepi32_ps(i32_2);
652
+ __m128 p3 = _mm_cvtepi32_ps(i32_3);
653
+
654
+ // Apply the scale
655
+ __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
656
+ __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
657
+ __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
658
+ __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
659
+
660
+ // Acummulate
661
+ acc_0 = _mm_add_ps(p0_d, acc_0);
662
+ acc_1 = _mm_add_ps(p1_d, acc_1);
663
+ acc_2 = _mm_add_ps(p2_d, acc_2);
664
+ acc_3 = _mm_add_ps(p3_d, acc_3);
665
+ }
666
+
667
+ sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
668
+
669
+ #endif
670
+ for (; ib < nb; ++ib) {
671
+ int sumi0 = 0;
672
+ int sumi1 = 0;
673
+
674
+ for (int j = 0; j < qk/2; ++j) {
675
+ const int v0 = (x[ib].qs[j] & 0x0F) - 8;
676
+ const int v1 = (x[ib].qs[j] >> 4) - 8;
677
+
678
+ sumi0 += (v0 * y[ib].qs[j]);
679
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
680
+ }
681
+
682
+ int sumi = sumi0 + sumi1;
683
+ sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
684
+ }
685
+
686
+ *s = sumf;
687
+ }
688
+
689
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
690
+ const int qk = QK8_1;
691
+ const int nb = n / qk;
692
+
693
+ assert(n % qk == 0);
694
+ assert(nrc == 1);
695
+ UNUSED(nrc);
696
+ UNUSED(bx);
697
+ UNUSED(by);
698
+ UNUSED(bs);
699
+
700
+ const block_q4_1 * GGML_RESTRICT x = vx;
701
+ const block_q8_1 * GGML_RESTRICT y = vy;
702
+
703
+ int ib = 0;
704
+ float sumf = 0;
705
+
706
+ #if defined(__AVX2__) || defined(__AVX__)
707
+ // Initialize accumulator with zeros
708
+ __m256 acc = _mm256_setzero_ps();
709
+
710
+ float summs = 0;
711
+
712
+ // Main loop
713
+ for (; ib < nb; ++ib) {
714
+ const float d0 = GGML_FP16_TO_FP32(x[ib].d);
715
+ const float d1 = GGML_FP16_TO_FP32(y[ib].d);
716
+
717
+ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
718
+
719
+ const __m256 d0v = _mm256_set1_ps( d0 );
720
+ const __m256 d1v = _mm256_set1_ps( d1 );
721
+
722
+ // Compute combined scales
723
+ const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
724
+
725
+ // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
726
+ const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
727
+ const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs );
728
+
729
+ const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
730
+
731
+ // Accumulate d0*d1*x*y
732
+ #if defined(__AVX2__)
733
+ acc = _mm256_fmadd_ps( d0d1, xy, acc );
734
+ #else
735
+ acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
736
+ #endif
737
+ }
738
+
739
+ sumf = hsum_float_8(acc) + summs;
740
+
741
+ #endif
742
+ for (; ib < nb; ++ib) {
743
+ int sumi0 = 0;
744
+ int sumi1 = 0;
745
+
746
+ for (int j = 0; j < qk/2; ++j) {
747
+ const int v0 = (x[ib].qs[j] & 0x0F);
748
+ const int v1 = (x[ib].qs[j] >> 4);
749
+
750
+ sumi0 += (v0 * y[ib].qs[j]);
751
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
752
+ }
753
+
754
+ int sumi = sumi0 + sumi1;
755
+ sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
756
+ }
757
+
758
+ *s = sumf;
759
+ }
760
+
761
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
762
+ const int qk = QK8_0;
763
+ const int nb = n / qk;
764
+
765
+ int ib = 0;
766
+ float sumf = 0;
767
+
768
+ assert(n % qk == 0);
769
+ assert(qk == QK5_0);
770
+ assert(nrc == 1);
771
+ UNUSED(nrc);
772
+ UNUSED(bx);
773
+ UNUSED(by);
774
+ UNUSED(bs);
775
+
776
+ const block_q5_0 * GGML_RESTRICT x = vx;
777
+ const block_q8_0 * GGML_RESTRICT y = vy;
778
+
779
+ #if defined(__AVX2__)
780
+ // Initialize accumulator with zeros
781
+ __m256 acc = _mm256_setzero_ps();
782
+
783
+ // Main loop
784
+ for (; ib < nb; ++ib) {
785
+ /* Compute combined scale for the block */
786
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
787
+
788
+ __m256i qx = bytes_from_nibbles_32(x[ib].qs);
789
+ __m256i bxhi = bytes_from_bits_32(x[ib].qh);
790
+ bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
791
+ qx = _mm256_or_si256(qx, bxhi);
792
+
793
+ __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
794
+
795
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);
796
+
797
+ /* Multiply q with scale and accumulate */
798
+ acc = _mm256_fmadd_ps(d, q, acc);
799
+ }
800
+
801
+ sumf = hsum_float_8(acc);
802
+ #elif defined(__AVX__)
803
+ // Initialize accumulator with zeros
804
+ __m256 acc = _mm256_setzero_ps();
805
+ __m128i mask = _mm_set1_epi8((char)0xF0);
806
+
807
+ // Main loop
808
+ for (; ib < nb; ++ib) {
809
+ /* Compute combined scale for the block */
810
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
811
+
812
+ __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
813
+ const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
814
+ __m128i bxhil = _mm256_castsi256_si128(bxhi);
815
+ __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
816
+ bxhil = _mm_andnot_si128(bxhil, mask);
817
+ bxhih = _mm_andnot_si128(bxhih, mask);
818
+ __m128i bxl = _mm256_castsi256_si128(bx_0);
819
+ __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
820
+ bxl = _mm_or_si128(bxl, bxhil);
821
+ bxh = _mm_or_si128(bxh, bxhih);
822
+ bx_0 = MM256_SET_M128I(bxh, bxl);
823
+
824
+ const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
825
+
826
+ const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
827
+
828
+ /* Multiply q with scale and accumulate */
829
+ acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
830
+ }
831
+
832
+ sumf = hsum_float_8(acc);
833
+
834
+ #endif
835
+ for (; ib < nb; ++ib) {
836
+ uint32_t qh;
837
+ memcpy(&qh, x[ib].qh, sizeof(qh));
838
+
839
+ int sumi0 = 0;
840
+ int sumi1 = 0;
841
+
842
+ for (int j = 0; j < qk/2; ++j) {
843
+ const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
844
+ const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
845
+
846
+ const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
847
+ const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
848
+
849
+ sumi0 += (x0 * y[ib].qs[j]);
850
+ sumi1 += (x1 * y[ib].qs[j + qk/2]);
851
+ }
852
+
853
+ int sumi = sumi0 + sumi1;
854
+ sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
855
+ }
856
+
857
+ *s = sumf;
858
+ }
859
+
860
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
861
+ const int qk = QK8_1;
862
+ const int nb = n / qk;
863
+
864
+ int ib = 0;
865
+ float sumf = 0;
866
+
867
+ assert(n % qk == 0);
868
+ assert(qk == QK5_1);
869
+ assert(nrc == 1);
870
+ UNUSED(nrc);
871
+ UNUSED(bx);
872
+ UNUSED(by);
873
+ UNUSED(bs);
874
+
875
+ const block_q5_1 * GGML_RESTRICT x = vx;
876
+ const block_q8_1 * GGML_RESTRICT y = vy;
877
+
878
+ #if defined(__AVX2__)
879
+ // Initialize accumulator with zeros
880
+ __m256 acc = _mm256_setzero_ps();
881
+
882
+ float summs = 0.0f;
883
+
884
+ // Main loop
885
+ for (; ib < nb; ++ib) {
886
+ const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
887
+
888
+ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
889
+
890
+ __m256i qx = bytes_from_nibbles_32(x[ib].qs);
891
+ __m256i bxhi = bytes_from_bits_32(x[ib].qh);
892
+ bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
893
+ qx = _mm256_or_si256(qx, bxhi);
894
+
895
+ const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
896
+ const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
897
+
898
+ const __m256 q = mul_sum_us8_pairs_float(qx, qy);
899
+
900
+ acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
901
+ }
902
+
903
+ sumf = hsum_float_8(acc) + summs;
904
+ #elif defined(__AVX__)
905
+ // Initialize accumulator with zeros
906
+ __m256 acc = _mm256_setzero_ps();
907
+ __m128i mask = _mm_set1_epi8(0x10);
908
+
909
+ float summs = 0.0f;
910
+
911
+ // Main loop
912
+ for (; ib < nb; ++ib) {
913
+ const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
914
+
915
+ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
916
+
917
+ __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
918
+ const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
919
+ __m128i bxhil = _mm256_castsi256_si128(bxhi);
920
+ __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
921
+ bxhil = _mm_and_si128(bxhil, mask);
922
+ bxhih = _mm_and_si128(bxhih, mask);
923
+ __m128i bxl = _mm256_castsi256_si128(bx_0);
924
+ __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
925
+ bxl = _mm_or_si128(bxl, bxhil);
926
+ bxh = _mm_or_si128(bxh, bxhih);
927
+ bx_0 = MM256_SET_M128I(bxh, bxl);
928
+
929
+ const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
930
+ const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
931
+
932
+ const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
933
+
934
+ acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
935
+ }
936
+
937
+ sumf = hsum_float_8(acc) + summs;
938
+
939
+ #endif
940
+ for (; ib < nb; ++ib) {
941
+ uint32_t qh;
942
+ memcpy(&qh, x[ib].qh, sizeof(qh));
943
+
944
+ int sumi0 = 0;
945
+ int sumi1 = 0;
946
+
947
+ for (int j = 0; j < qk/2; ++j) {
948
+ const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
949
+ const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
950
+
951
+ const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
952
+ const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
953
+
954
+ sumi0 += (x0 * y[ib].qs[j]);
955
+ sumi1 += (x1 * y[ib].qs[j + qk/2]);
956
+ }
957
+
958
+ int sumi = sumi0 + sumi1;
959
+ sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
960
+ }
961
+
962
+ *s = sumf;
963
+ }
964
+
965
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
966
+ const int qk = QK8_0;
967
+ const int nb = n / qk;
968
+
969
+ assert(n % qk == 0);
970
+ assert(nrc == 1);
971
+ UNUSED(nrc);
972
+ UNUSED(bx);
973
+ UNUSED(by);
974
+ UNUSED(bs);
975
+
976
+ const block_q8_0 * GGML_RESTRICT x = vx;
977
+ const block_q8_0 * GGML_RESTRICT y = vy;
978
+
979
+ int ib = 0;
980
+ float sumf = 0;
981
+
982
+ #if defined(__AVX2__)
983
+ // Initialize accumulator with zeros
984
+ __m256 acc = _mm256_setzero_ps();
985
+
986
+ // Main loop
987
+ for (; ib < nb; ++ib) {
988
+ // Compute combined scale for the block
989
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
990
+ __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs);
991
+ __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
992
+
993
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);
994
+
995
+ // Multiply q with scale and accumulate
996
+ acc = _mm256_fmadd_ps( d, q, acc );
997
+ }
998
+
999
+ sumf = hsum_float_8(acc);
1000
+ #elif defined(__AVX__)
1001
+ __m256 accum = _mm256_setzero_ps();
1002
+
1003
+ for (; ib + 1 < nb; ib += 2) {
1004
+ const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
1005
+ const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
1006
+ const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
1007
+ const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
1008
+ const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
1009
+ const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
1010
+ const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
1011
+ const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
1012
+
1013
+ const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
1014
+ const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
1015
+ accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
1016
+ }
1017
+
1018
+ sumf = hsum_float_8(accum);
1019
+
1020
+ #endif
1021
+ for (; ib < nb; ++ib) {
1022
+ int sumi = 0;
1023
+
1024
+ for (int j = 0; j < qk; j++) {
1025
+ sumi += x[ib].qs[j]*y[ib].qs[j];
1026
+ }
1027
+
1028
+ sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
1029
+ }
1030
+
1031
+ *s = sumf;
1032
+ }
1033
+
1034
+ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1035
+ assert(nrc == 1);
1036
+ UNUSED(nrc);
1037
+ UNUSED(bx);
1038
+ UNUSED(by);
1039
+ UNUSED(bs);
1040
+
1041
+ const block_tq1_0 * GGML_RESTRICT x = vx;
1042
+ const block_q8_K * GGML_RESTRICT y = vy;
1043
+
1044
+ const int nb = n / QK_K;
1045
+
1046
+ #if defined(__AVX2__)
1047
+ __m256 sumf = _mm256_setzero_ps();
1048
+
1049
+ for (int i = 0; i < nb; ++i) {
1050
+ // 16-bit sums
1051
+ __m256i sumi0 = _mm256_setzero_si256();
1052
+ __m256i sumi1 = _mm256_setzero_si256();
1053
+ __m256i sumi2 = _mm256_setzero_si256();
1054
+
1055
+ // first 32 bytes of 5 elements
1056
+ {
1057
+ __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs));
1058
+ // 8-bit multiplies with shifts, masks and adds
1059
+ __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3
1060
+ __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9
1061
+ __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9
1062
+ __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9
1063
+
1064
+ // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits?
1065
+
1066
+ // Cancel the +1 from avg so that it behaves like a halving add
1067
+ qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1));
1068
+ qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1));
1069
+ qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1));
1070
+ qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1));
1071
+ qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1));
1072
+ // Multiply by 3 and get the top 2 bits
1073
+ qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256()));
1074
+ qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256()));
1075
+ qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256()));
1076
+ qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256()));
1077
+ qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256()));
1078
+ qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3));
1079
+ qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3));
1080
+ qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3));
1081
+ qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3));
1082
+ qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3));
1083
+
1084
+ const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0));
1085
+ const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32));
1086
+ const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64));
1087
+ const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96));
1088
+ const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128));
1089
+
1090
+ qx0 = _mm256_maddubs_epi16(qx0, qy0);
1091
+ qx1 = _mm256_maddubs_epi16(qx1, qy1);
1092
+ qx2 = _mm256_maddubs_epi16(qx2, qy2);
1093
+ qx3 = _mm256_maddubs_epi16(qx3, qy3);
1094
+ qx4 = _mm256_maddubs_epi16(qx4, qy4);
1095
+
1096
+ sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
1097
+ sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
1098
+ sumi2 = _mm256_add_epi16(sumi2, qx4);
1099
+ }
1100
+
1101
+ // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
1102
+ {
1103
+ __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
1104
+ uint32_t qh;
1105
+ memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
1106
+ __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));
1107
+ __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3
1108
+ __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9
1109
+ __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9
1110
+ __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9
1111
+ __m256i qx01 = MM256_SET_M128I(qx1, qx0);
1112
+ __m256i qx23 = MM256_SET_M128I(qx3, qx2);
1113
+
1114
+ // avx2 does not have 8-bit multiplies, so 16-bit it is.
1115
+ qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1));
1116
+ qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF));
1117
+ __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1));
1118
+
1119
+ __m256i qx45 = MM256_SET_M128I(qx5, qx4);
1120
+
1121
+ // Cancel the +1 from avg so that it behaves like a halving add
1122
+ qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1));
1123
+ qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1));
1124
+ qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1));
1125
+ // Multiply by 3 and get the top 2 bits
1126
+ qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256()));
1127
+ qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256()));
1128
+ qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256()));
1129
+ qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3));
1130
+ qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3));
1131
+ qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3));
1132
+
1133
+ const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160));
1134
+ const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192));
1135
+ const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224));
1136
+
1137
+ qx01 = _mm256_maddubs_epi16(qx01, qy01);
1138
+ qx23 = _mm256_maddubs_epi16(qx23, qy23);
1139
+ qx45 = _mm256_maddubs_epi16(qx45, qy45);
1140
+
1141
+ sumi0 = _mm256_add_epi16(sumi0, qx01);
1142
+ sumi1 = _mm256_add_epi16(sumi1, qx23);
1143
+ sumi2 = _mm256_add_epi16(sumi2, qx45);
1144
+ }
1145
+
1146
+ const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
1147
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
1148
+
1149
+ sumi0 = _mm256_sub_epi16(sumi0, ysum);
1150
+ sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
1151
+ sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
1152
+
1153
+ sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
1154
+ }
1155
+
1156
+ *s = hsum_float_8(sumf);
1157
+
1158
+ #else
1159
+ const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
1160
+
1161
+ float sumf = 0.0f;
1162
+
1163
+ for (int i = 0; i < nb; ++i) {
1164
+ int sum = 0;
1165
+
1166
+ for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
1167
+ for (size_t l = 0; l < 5; ++l) {
1168
+ for (size_t m = 0; m < 32; ++m) {
1169
+ uint8_t q = x[i].qs[j + m] * pow3[l];
1170
+ uint16_t xi = ((uint16_t) q * 3) >> 8;
1171
+ sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
1172
+ }
1173
+ }
1174
+ }
1175
+ for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
1176
+ for (size_t l = 0; l < 5; ++l) {
1177
+ for (size_t m = 0; m < 16; ++m) {
1178
+ uint8_t q = x[i].qs[j + m] * pow3[l];
1179
+ uint16_t xi = ((uint16_t) q * 3) >> 8;
1180
+ sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
1181
+ }
1182
+ }
1183
+ }
1184
+
1185
+ for (size_t l = 0; l < 4; ++l) {
1186
+ for (size_t j = 0; j < sizeof(x->qh); ++j) {
1187
+ uint8_t q = x[i].qh[j] * pow3[l];
1188
+ uint16_t xi = ((uint16_t) q * 3) >> 8;
1189
+ sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
1190
+ }
1191
+ }
1192
+
1193
+ sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
1194
+ }
1195
+
1196
+ *s = sumf;
1197
+ #endif
1198
+ }
1199
+
1200
+ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1201
+ assert(nrc == 1);
1202
+ UNUSED(nrc);
1203
+ UNUSED(bx);
1204
+ UNUSED(by);
1205
+ UNUSED(bs);
1206
+
1207
+ const block_tq2_0 * GGML_RESTRICT x = vx;
1208
+ const block_q8_K * GGML_RESTRICT y = vy;
1209
+
1210
+ const int nb = n / QK_K;
1211
+
1212
+ #if defined(__AVX2__)
1213
+ __m256 sumf = _mm256_setzero_ps();
1214
+
1215
+ for (int i = 0; i < nb; ++i) {
1216
+ // 16-bit sums, because 256*127 still fits
1217
+ __m256i sumi0 = _mm256_setzero_si256();
1218
+ __m256i sumi1 = _mm256_setzero_si256();
1219
+
1220
+ for (size_t j = 0; j < sizeof(x->qs); j += 32) {
1221
+ __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));
1222
+ __m256i qx1 = _mm256_srli_epi16(qx0, 2);
1223
+ __m256i qx2 = _mm256_srli_epi16(qx0, 4);
1224
+ __m256i qx3 = _mm256_srli_epi16(qx0, 6);
1225
+
1226
+ // 0, 1, 2 (should not be 3)
1227
+ qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3));
1228
+ qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3));
1229
+ qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3));
1230
+ qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3));
1231
+
1232
+ const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0));
1233
+ const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32));
1234
+ const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64));
1235
+ const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96));
1236
+
1237
+ qx0 = _mm256_maddubs_epi16(qx0, qy0);
1238
+ qx1 = _mm256_maddubs_epi16(qx1, qy1);
1239
+ qx2 = _mm256_maddubs_epi16(qx2, qy2);
1240
+ qx3 = _mm256_maddubs_epi16(qx3, qy3);
1241
+
1242
+ sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
1243
+ sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
1244
+ }
1245
+
1246
+ const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
1247
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
1248
+
1249
+ sumi0 = _mm256_add_epi16(sumi0, sumi1);
1250
+ sumi0 = _mm256_sub_epi16(sumi0, ysum);
1251
+ sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
1252
+
1253
+ sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
1254
+ }
1255
+
1256
+ *s = hsum_float_8(sumf);
1257
+
1258
+ #else
1259
+ float sumf = 0.0f;
1260
+
1261
+ for (int i = 0; i < nb; ++i) {
1262
+ int32_t sumi = 0;
1263
+
1264
+ for (size_t j = 0; j < sizeof(x->qs); j += 32) {
1265
+ for (size_t l = 0; l < 4; ++l) {
1266
+ for (size_t k = 0; k < 32; ++k) {
1267
+ sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
1268
+ }
1269
+ }
1270
+ }
1271
+
1272
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1273
+
1274
+ sumf += (float) sumi * d;
1275
+ }
1276
+
1277
+ *s = sumf;
1278
+ #endif
1279
+ }
1280
+
1281
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1282
+ assert(nrc == 1);
1283
+ UNUSED(nrc);
1284
+ UNUSED(bx);
1285
+ UNUSED(by);
1286
+ UNUSED(bs);
1287
+
1288
+ const block_q2_K * GGML_RESTRICT x = vx;
1289
+ const block_q8_K * GGML_RESTRICT y = vy;
1290
+
1291
+ const int nb = n / QK_K;
1292
+
1293
+ #if defined __AVX2__
1294
+
1295
+ const __m256i m3 = _mm256_set1_epi8(3);
1296
+ const __m128i m4 = _mm_set1_epi8(0xF);
1297
+
1298
+ __m256 acc = _mm256_setzero_ps();
1299
+
1300
+ for (int i = 0; i < nb; ++i) {
1301
+
1302
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1303
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1304
+
1305
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
1306
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1307
+
1308
+ const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
1309
+ const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
1310
+ const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
1311
+ const __m256i mins = _mm256_cvtepi8_epi16(mins8);
1312
+ const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums));
1313
+
1314
+ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc);
1315
+
1316
+ const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
1317
+ const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
1318
+ const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
1319
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
1320
+
1321
+ __m256i sumi = _mm256_setzero_si256();
1322
+
1323
+ for (int j = 0; j < QK_K/128; ++j) {
1324
+
1325
+ const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32;
1326
+
1327
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1328
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1329
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1330
+ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1331
+
1332
+ const __m256i q2_0 = _mm256_and_si256(q2bits, m3);
1333
+ const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3);
1334
+ const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3);
1335
+ const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3);
1336
+
1337
+ __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
1338
+ __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
1339
+ __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2);
1340
+ __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3);
1341
+
1342
+ p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0);
1343
+ p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1);
1344
+ p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2);
1345
+ p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3);
1346
+
1347
+ p0 = _mm256_add_epi32(p0, p1);
1348
+ p2 = _mm256_add_epi32(p2, p3);
1349
+
1350
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
1351
+ }
1352
+
1353
+ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
1354
+
1355
+ }
1356
+
1357
+ *s = hsum_float_8(acc);
1358
+
1359
+ #elif defined __AVX__
1360
+
1361
+ const __m128i m3 = _mm_set1_epi8(0x3);
1362
+ const __m128i m4 = _mm_set1_epi8(0xF);
1363
+ const __m128i m2 = _mm_set1_epi8(0x2);
1364
+
1365
+ __m256 acc = _mm256_setzero_ps();
1366
+
1367
+ for (int i = 0; i < nb; ++i) {
1368
+
1369
+ const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1370
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1371
+
1372
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
1373
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1374
+
1375
+ // load mins and scales from block_q2_K.scales[QK_K/16]
1376
+ const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
1377
+ const __m128i scales16 = _mm_and_si128(mins_and_scales, m4);
1378
+ const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
1379
+ const __m128i mins_0 = _mm_cvtepi8_epi16(mins16);
1380
+ const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16));
1381
+
1382
+ // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
1383
+ const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
1384
+ const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
1385
+
1386
+ // sumf += -dmin * summs in 32bits*8
1387
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
1388
+
1389
+ const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
1390
+ const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
1391
+ const __m128i scales[2] = { scales_0, scales_1 };
1392
+
1393
+ __m128i sumi_0 = _mm_setzero_si128();
1394
+ __m128i sumi_1 = _mm_setzero_si128();
1395
+
1396
+ for (int j = 0; j < QK_K/128; ++j) {
1397
+
1398
+ // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
1399
+ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1400
+ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1401
+ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1402
+ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1403
+ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1404
+ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1405
+ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1406
+ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1407
+
1408
+ // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
1409
+ __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
1410
+ const __m128i q2_0 = _mm_and_si128(q2bits, m3);
1411
+ const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
1412
+ const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
1413
+ const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
1414
+ q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
1415
+ const __m128i q2_1 = _mm_and_si128(q2bits, m3);
1416
+ const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
1417
+ const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
1418
+ const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
1419
+
1420
+ // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
1421
+ __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
1422
+ __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
1423
+ __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
1424
+ __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
1425
+ __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
1426
+ __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
1427
+ __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
1428
+ __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);
1429
+
1430
+ // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
1431
+ __m128i shuffle = _mm_set1_epi16(0x0100);
1432
+ p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0);
1433
+ shuffle = _mm_add_epi16(shuffle, m2);
1434
+ p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1);
1435
+ shuffle = _mm_add_epi16(shuffle, m2);
1436
+ p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2);
1437
+ shuffle = _mm_add_epi16(shuffle, m2);
1438
+ p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3);
1439
+ shuffle = _mm_add_epi16(shuffle, m2);
1440
+ p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4);
1441
+ shuffle = _mm_add_epi16(shuffle, m2);
1442
+ p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5);
1443
+ shuffle = _mm_add_epi16(shuffle, m2);
1444
+ p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6);
1445
+ shuffle = _mm_add_epi16(shuffle, m2);
1446
+ p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7);
1447
+
1448
+ p0 = _mm_add_epi32(p0, p1);
1449
+ p2 = _mm_add_epi32(p2, p3);
1450
+ p4 = _mm_add_epi32(p4, p5);
1451
+ p6 = _mm_add_epi32(p6, p7);
1452
+
1453
+ // isum in 32bits*4*2
1454
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
1455
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
1456
+ }
1457
+
1458
+ // sumf += dall * isum - dmin * summs in 32bits
1459
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
1460
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
1461
+ }
1462
+
1463
+ *s = hsum_float_8(acc);
1464
+
1465
+ #else
1466
+
1467
+ float sumf = 0;
1468
+
1469
+ for (int i = 0; i < nb; ++i) {
1470
+
1471
+ const uint8_t * q2 = x[i].qs;
1472
+ const int8_t * q8 = y[i].qs;
1473
+ const uint8_t * sc = x[i].scales;
1474
+
1475
+ int summs = 0;
1476
+ for (int j = 0; j < 16; ++j) {
1477
+ summs += y[i].bsums[j] * (sc[j] >> 4);
1478
+ }
1479
+
1480
+ const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1481
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1482
+
1483
+ int isum = 0;
1484
+ int is = 0;
1485
+ int d;
1486
+ for (int k = 0; k < QK_K/128; ++k) {
1487
+ int shift = 0;
1488
+ for (int j = 0; j < 4; ++j) {
1489
+ d = sc[is++] & 0xF;
1490
+ int isuml = 0;
1491
+ for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1492
+ isum += d * isuml;
1493
+ d = sc[is++] & 0xF;
1494
+ isuml = 0;
1495
+ for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1496
+ isum += d * isuml;
1497
+ shift += 2;
1498
+ q8 += 32;
1499
+ }
1500
+ q2 += 32;
1501
+ }
1502
+ sumf += dall * isum - dmin * summs;
1503
+ }
1504
+ *s = sumf;
1505
+ #endif
1506
+ }
1507
+
1508
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1509
+ assert(n % QK_K == 0);
1510
+ assert(nrc == 1);
1511
+ UNUSED(nrc);
1512
+ UNUSED(bx);
1513
+ UNUSED(by);
1514
+ UNUSED(bs);
1515
+
1516
+ const uint32_t kmask1 = 0x03030303;
1517
+ const uint32_t kmask2 = 0x0f0f0f0f;
1518
+
1519
+ const block_q3_K * GGML_RESTRICT x = vx;
1520
+ const block_q8_K * GGML_RESTRICT y = vy;
1521
+
1522
+ const int nb = n / QK_K;
1523
+
1524
+ #if defined __AVX2__
1525
+
1526
+ const __m256i m3 = _mm256_set1_epi8(3);
1527
+ const __m256i mone = _mm256_set1_epi8(1);
1528
+ const __m128i m32 = _mm_set1_epi8(32);
1529
+
1530
+ __m256 acc = _mm256_setzero_ps();
1531
+
1532
+ uint32_t aux[3];
1533
+
1534
+ for (int i = 0; i < nb; ++i) {
1535
+
1536
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1537
+
1538
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1539
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1540
+
1541
+ // Set up scales
1542
+ memcpy(aux, x[i].scales, 12);
1543
+ __m128i scales128 = _mm_set_epi32(
1544
+ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
1545
+ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
1546
+ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
1547
+ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
1548
+ scales128 = _mm_sub_epi8(scales128, m32);
1549
+ const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
1550
+ const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
1551
+ const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
1552
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
1553
+
1554
+ // high bit
1555
+ const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
1556
+
1557
+ // integer accumulator
1558
+ __m256i sumi = _mm256_setzero_si256();
1559
+
1560
+ int bit = 0;
1561
+ int is = 0;
1562
+
1563
+ for (int j = 0; j < QK_K/128; ++j) {
1564
+ // load low 2 bits
1565
+ const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32;
1566
+
1567
+ // prepare low and high bits
1568
+ const __m256i q3l_0 = _mm256_and_si256(q3bits, m3);
1569
+ const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
1570
+ ++bit;
1571
+
1572
+ const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
1573
+ const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
1574
+ ++bit;
1575
+
1576
+ const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
1577
+ const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
1578
+ ++bit;
1579
+
1580
+ const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
1581
+ const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
1582
+ ++bit;
1583
+
1584
+ // load Q8 quants
1585
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1586
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1587
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1588
+ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1589
+
1590
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
1591
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
1592
+ // and 2 if the high bit was set)
1593
+ __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
1594
+ __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
1595
+ __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
1596
+ __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
1597
+
1598
+ __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
1599
+ __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
1600
+ __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
1601
+ __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
1602
+
1603
+ p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
1604
+ p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
1605
+ p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
1606
+ p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
1607
+
1608
+ // multiply with scales
1609
+ p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
1610
+ p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
1611
+ p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
1612
+ p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
1613
+
1614
+ // accumulate
1615
+ p16_0 = _mm256_add_epi32(p16_0, p16_1);
1616
+ p16_2 = _mm256_add_epi32(p16_2, p16_3);
1617
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
1618
+
1619
+ }
1620
+
1621
+ // multiply with block scale and accumulate
1622
+ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
1623
+
1624
+ }
1625
+
1626
+ *s = hsum_float_8(acc);
1627
+
1628
+ #elif defined __AVX__
1629
+
1630
+ const __m128i m3 = _mm_set1_epi8(3);
1631
+ const __m128i mone = _mm_set1_epi8(1);
1632
+ const __m128i m32 = _mm_set1_epi8(32);
1633
+ const __m128i m2 = _mm_set1_epi8(2);
1634
+
1635
+ __m256 acc = _mm256_setzero_ps();
1636
+
1637
+ const uint32_t *aux;
1638
+
1639
+ for (int i = 0; i < nb; ++i) {
1640
+
1641
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1642
+
1643
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1644
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1645
+
1646
+ // Set up scales
1647
+ aux = (const uint32_t *)x[i].scales;
1648
+ __m128i scales128 = _mm_set_epi32(
1649
+ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
1650
+ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
1651
+ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
1652
+ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
1653
+ scales128 = _mm_sub_epi8(scales128, m32);
1654
+ const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
1655
+ const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
1656
+ const __m128i scales[2] = { scales_0, scales_1 };
1657
+
1658
+ // high bit *128*2 from block_q3_K.hmask[QK_K/8]
1659
+ const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
1660
+ const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
1661
+
1662
+ // integer accumulator
1663
+ __m128i sumi_0 = _mm_setzero_si128();
1664
+ __m128i sumi_1 = _mm_setzero_si128();
1665
+
1666
+ for (int j = 0; j < QK_K/128; ++j) {
1667
+ // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
1668
+ const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
1669
+ const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
1670
+
1671
+ // prepare low and high bits
1672
+ const int bit = j << 2;
1673
+
1674
+ const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
1675
+ const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
1676
+ const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
1677
+ const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
1678
+
1679
+ const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
1680
+ const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
1681
+ const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
1682
+ const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
1683
+
1684
+ const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
1685
+ const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
1686
+ const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
1687
+ const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
1688
+
1689
+ const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
1690
+ const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
1691
+ const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
1692
+ const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
1693
+
1694
+ // load Q8 quants from block_q8_K.qs[QK_K]
1695
+ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1696
+ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1697
+ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1698
+ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1699
+ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1700
+ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1701
+ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1702
+ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1703
+
1704
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
1705
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
1706
+ // and 2 if the high bit was set)
1707
+ __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
1708
+ __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
1709
+ __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
1710
+ __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
1711
+ __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
1712
+ __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
1713
+ __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
1714
+ __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
1715
+
1716
+ __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
1717
+ __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
1718
+ __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
1719
+ __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
1720
+ __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
1721
+ __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
1722
+ __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
1723
+ __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
1724
+
1725
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
1726
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
1727
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
1728
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
1729
+ p16_4 = _mm_sub_epi16(p16_4, q8s_4);
1730
+ p16_5 = _mm_sub_epi16(p16_5, q8s_5);
1731
+ p16_6 = _mm_sub_epi16(p16_6, q8s_6);
1732
+ p16_7 = _mm_sub_epi16(p16_7, q8s_7);
1733
+
1734
+ // multiply with scales
1735
+ __m128i shuffle = _mm_set1_epi16(0x0100);
1736
+ p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
1737
+ shuffle = _mm_add_epi16(shuffle, m2);
1738
+ p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
1739
+ shuffle = _mm_add_epi16(shuffle, m2);
1740
+ p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
1741
+ shuffle = _mm_add_epi16(shuffle, m2);
1742
+ p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
1743
+ shuffle = _mm_add_epi16(shuffle, m2);
1744
+ p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
1745
+ shuffle = _mm_add_epi16(shuffle, m2);
1746
+ p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
1747
+ shuffle = _mm_add_epi16(shuffle, m2);
1748
+ p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
1749
+ shuffle = _mm_add_epi16(shuffle, m2);
1750
+ p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
1751
+
1752
+ // accumulate
1753
+ p16_0 = _mm_add_epi32(p16_0, p16_1);
1754
+ p16_2 = _mm_add_epi32(p16_2, p16_3);
1755
+ p16_4 = _mm_add_epi32(p16_4, p16_5);
1756
+ p16_6 = _mm_add_epi32(p16_6, p16_7);
1757
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
1758
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
1759
+
1760
+ }
1761
+
1762
+ // multiply with block scale and accumulate
1763
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
1764
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
1765
+
1766
+ }
1767
+
1768
+ *s = hsum_float_8(acc);
1769
+
1770
+ #else
1771
+ // scalar version
1772
+ // This function is written like this so the compiler can manage to vectorize most of it
1773
+ // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
1774
+ // manually vectorized version above. Every other version I tried would run at least 4 times slower.
1775
+ // The ideal situation would be if we could just write the code once, and the compiler would
1776
+ // automatically produce the best possible set of machine instructions, instead of us having to manually
1777
+ // write vectorized versions for AVX, ARM_NEON, etc.
1778
+
1779
+ int8_t aux8[QK_K];
1780
+ int16_t aux16[8];
1781
+ float sums [8];
1782
+ int32_t aux32[8];
1783
+ memset(sums, 0, 8*sizeof(float));
1784
+
1785
+ uint32_t auxs[4];
1786
+ const int8_t * scales = (const int8_t*)auxs;
1787
+
1788
+ float sumf = 0;
1789
+ for (int i = 0; i < nb; ++i) {
1790
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1791
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
1792
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1793
+ memset(aux32, 0, 8*sizeof(int32_t));
1794
+ int8_t * GGML_RESTRICT a = aux8;
1795
+ uint8_t m = 1;
1796
+ for (int j = 0; j < QK_K; j += 128) {
1797
+ for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
1798
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1799
+ a += 32; m <<= 1;
1800
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
1801
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1802
+ a += 32; m <<= 1;
1803
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
1804
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1805
+ a += 32; m <<= 1;
1806
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
1807
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1808
+ a += 32; m <<= 1;
1809
+ q3 += 32;
1810
+ }
1811
+ a = aux8;
1812
+
1813
+ memcpy(auxs, x[i].scales, 12);
1814
+ uint32_t tmp = auxs[2];
1815
+ auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
1816
+ auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
1817
+ auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
1818
+ auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
1819
+ for (int j = 0; j < QK_K/16; ++j) {
1820
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1821
+ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1822
+ q8 += 8; a += 8;
1823
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1824
+ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1825
+ q8 += 8; a += 8;
1826
+ }
1827
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1828
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1829
+ }
1830
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
1831
+ *s = sumf;
1832
+
1833
+ #endif
1834
+
1835
+ }
1836
+
1837
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1838
+ assert(n % QK_K == 0);
1839
+ assert(nrc == 1);
1840
+ UNUSED(nrc);
1841
+ UNUSED(bx);
1842
+ UNUSED(by);
1843
+ UNUSED(bs);
1844
+
1845
+ const block_q4_K * GGML_RESTRICT x = vx;
1846
+ const block_q8_K * GGML_RESTRICT y = vy;
1847
+
1848
+ const int nb = n / QK_K;
1849
+
1850
+ static const uint32_t kmask1 = 0x3f3f3f3f;
1851
+ static const uint32_t kmask2 = 0x0f0f0f0f;
1852
+ static const uint32_t kmask3 = 0x03030303;
1853
+
1854
+ uint32_t utmp[4];
1855
+
1856
+ #if defined __AVX2__
1857
+
1858
+ const __m256i m4 = _mm256_set1_epi8(0xF);
1859
+
1860
+ __m256 acc = _mm256_setzero_ps();
1861
+ __m128 acc_m = _mm_setzero_ps();
1862
+
1863
+ for (int i = 0; i < nb; ++i) {
1864
+
1865
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1866
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1867
+
1868
+ memcpy(utmp, x[i].scales, 12);
1869
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1870
+ const uint32_t uaux = utmp[1] & kmask1;
1871
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1872
+ utmp[2] = uaux;
1873
+ utmp[0] &= kmask1;
1874
+
1875
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1876
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1877
+
1878
+ const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
1879
+
1880
+ const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
1881
+ const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
1882
+ const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
1883
+ acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
1884
+
1885
+ const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
1886
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);
1887
+
1888
+ __m256i sumi = _mm256_setzero_si256();
1889
+
1890
+ for (int j = 0; j < QK_K/64; ++j) {
1891
+
1892
+ const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
1893
+ const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
1894
+
1895
+ const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
1896
+ const __m256i q4l = _mm256_and_si256(q4bits, m4);
1897
+ const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
1898
+
1899
+ const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1900
+ __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
1901
+ p16l = _mm256_madd_epi16(scale_l, p16l);
1902
+
1903
+ const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1904
+ __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
1905
+ p16h = _mm256_madd_epi16(scale_h, p16h);
1906
+ const __m256i sumj = _mm256_add_epi32(p16l, p16h);
1907
+
1908
+ sumi = _mm256_add_epi32(sumi, sumj);
1909
+ }
1910
+
1911
+ __m256 vd = _mm256_set1_ps(d);
1912
+ acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
1913
+
1914
+ }
1915
+
1916
+ acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
1917
+ acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
1918
+
1919
+ *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
1920
+
1921
+ #elif defined __AVX__
1922
+
1923
+ const __m128i m4 = _mm_set1_epi8(0xF);
1924
+ const __m128i m2 = _mm_set1_epi8(0x2);
1925
+
1926
+ __m256 acc = _mm256_setzero_ps();
1927
+ __m128 acc_m = _mm_setzero_ps();
1928
+
1929
+ for (int i = 0; i < nb; ++i) {
1930
+
1931
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1932
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1933
+
1934
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1935
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1936
+
1937
+ memcpy(utmp, x[i].scales, 12);
1938
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1939
+ const uint32_t uaux = utmp[1] & kmask1;
1940
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1941
+ utmp[2] = uaux;
1942
+ utmp[0] &= kmask1;
1943
+
1944
+ const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
1945
+ const __m128i scales = _mm_cvtepu8_epi16(utmps);
1946
+ const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
1947
+
1948
+ const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
1949
+ const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
1950
+ const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
1951
+ const __m128i prod = _mm_madd_epi16(mins, q8s);
1952
+ acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
1953
+
1954
+ __m128i sumi_0 = _mm_setzero_si128();
1955
+ __m128i sumi_1 = _mm_setzero_si128();
1956
+
1957
+ __m128i shuffle = _mm_set1_epi16(0x0100);
1958
+ for (int j = 0; j < QK_K/64; ++j) {
1959
+
1960
+ const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
1961
+ shuffle = _mm_add_epi16(shuffle, m2);
1962
+ const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
1963
+ shuffle = _mm_add_epi16(shuffle, m2);
1964
+
1965
+ __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
1966
+ const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
1967
+ const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
1968
+ q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
1969
+ const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
1970
+ const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
1971
+
1972
+ const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1973
+ __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
1974
+ p16l = _mm_madd_epi16(scale_l, p16l);
1975
+ sumi_0 = _mm_add_epi32(sumi_0, p16l);
1976
+ const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1977
+ p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
1978
+ p16l = _mm_madd_epi16(scale_l, p16l);
1979
+ sumi_1 = _mm_add_epi32(sumi_1, p16l);
1980
+
1981
+ const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1982
+ __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
1983
+ p16h = _mm_madd_epi16(scale_h, p16h);
1984
+ sumi_0 = _mm_add_epi32(sumi_0, p16h);
1985
+ const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1986
+ p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
1987
+ p16h = _mm_madd_epi16(scale_h, p16h);
1988
+ sumi_1 = _mm_add_epi32(sumi_1, p16h);
1989
+
1990
+ }
1991
+
1992
+ __m256 vd = _mm256_set1_ps(d);
1993
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
1994
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
1995
+
1996
+ }
1997
+
1998
+ acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
1999
+ acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
2000
+
2001
+ *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
2002
+
2003
+ #else
2004
+
2005
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
2006
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
2007
+
2008
+ int8_t aux8[QK_K];
2009
+ int16_t aux16[8];
2010
+ float sums [8];
2011
+ int32_t aux32[8];
2012
+ memset(sums, 0, 8*sizeof(float));
2013
+
2014
+ float sumf = 0;
2015
+ for (int i = 0; i < nb; ++i) {
2016
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
2017
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2018
+ memset(aux32, 0, 8*sizeof(int32_t));
2019
+ int8_t * GGML_RESTRICT a = aux8;
2020
+ for (int j = 0; j < QK_K/64; ++j) {
2021
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
2022
+ a += 32;
2023
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
2024
+ a += 32; q4 += 32;
2025
+ }
2026
+ memcpy(utmp, x[i].scales, 12);
2027
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2028
+ const uint32_t uaux = utmp[1] & kmask1;
2029
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2030
+ utmp[2] = uaux;
2031
+ utmp[0] &= kmask1;
2032
+
2033
+ int sumi = 0;
2034
+ for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
2035
+ a = aux8;
2036
+ int is = 0;
2037
+ for (int j = 0; j < QK_K/32; ++j) {
2038
+ int32_t scale = scales[is++];
2039
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2040
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2041
+ q8 += 8; a += 8;
2042
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2043
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2044
+ q8 += 8; a += 8;
2045
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2046
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2047
+ q8 += 8; a += 8;
2048
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2049
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2050
+ q8 += 8; a += 8;
2051
+ }
2052
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2053
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2054
+ const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
2055
+ sumf -= dmin * sumi;
2056
+ }
2057
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
2058
+ *s = sumf;
2059
+ #endif
2060
+ }
2061
+
2062
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2063
+ assert(n % QK_K == 0);
2064
+ assert(nrc == 1);
2065
+ UNUSED(nrc);
2066
+ UNUSED(bx);
2067
+ UNUSED(by);
2068
+ UNUSED(bs);
2069
+
2070
+ const block_q5_K * GGML_RESTRICT x = vx;
2071
+ const block_q8_K * GGML_RESTRICT y = vy;
2072
+
2073
+ const int nb = n / QK_K;
2074
+
2075
+ static const uint32_t kmask1 = 0x3f3f3f3f;
2076
+ static const uint32_t kmask2 = 0x0f0f0f0f;
2077
+ static const uint32_t kmask3 = 0x03030303;
2078
+
2079
+ uint32_t utmp[4];
2080
+
2081
+ #if defined __AVX2__
2082
+
2083
+ const __m256i m4 = _mm256_set1_epi8(0xF);
2084
+ const __m128i mzero = _mm_setzero_si128();
2085
+ const __m256i mone = _mm256_set1_epi8(1);
2086
+
2087
+ __m256 acc = _mm256_setzero_ps();
2088
+
2089
+ float summs = 0.f;
2090
+
2091
+ for (int i = 0; i < nb; ++i) {
2092
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
2093
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2094
+
2095
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
2096
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
2097
+
2098
+ memcpy(utmp, x[i].scales, 12);
2099
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2100
+ const uint32_t uaux = utmp[1] & kmask1;
2101
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2102
+ utmp[2] = uaux;
2103
+ utmp[0] &= kmask1;
2104
+
2105
+ const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
2106
+
2107
+ const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
2108
+ const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
2109
+ const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
2110
+ const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
2111
+ summs += dmin * _mm_extract_epi32(hsum, 0);
2112
+
2113
+ const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
2114
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);
2115
+
2116
+ const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
2117
+ __m256i hmask = mone;
2118
+
2119
+ __m256i sumi = _mm256_setzero_si256();
2120
+
2121
+ int bit = 0;
2122
+
2123
+ for (int j = 0; j < QK_K/64; ++j) {
2124
+
2125
+ const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
2126
+ const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
2127
+
2128
+ const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
2129
+
2130
+ const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
2131
+ const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
2132
+ const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
2133
+ hmask = _mm256_slli_epi16(hmask, 1);
2134
+
2135
+ const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
2136
+ const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
2137
+ const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);
2138
+ hmask = _mm256_slli_epi16(hmask, 1);
2139
+
2140
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2141
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2142
+
2143
+ __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
2144
+ __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
2145
+
2146
+ p16_0 = _mm256_madd_epi16(scale_0, p16_0);
2147
+ p16_1 = _mm256_madd_epi16(scale_1, p16_1);
2148
+
2149
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
2150
+
2151
+ }
2152
+
2153
+ __m256 vd = _mm256_set1_ps(d);
2154
+ acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
2155
+
2156
+ }
2157
+
2158
+ *s = hsum_float_8(acc) + summs;
2159
+
2160
+ #elif defined __AVX__
2161
+
2162
+ const __m128i m4 = _mm_set1_epi8(0xF);
2163
+ const __m128i mzero = _mm_setzero_si128();
2164
+ const __m128i mone = _mm_set1_epi8(1);
2165
+ const __m128i m2 = _mm_set1_epi8(2);
2166
+
2167
+ __m256 acc = _mm256_setzero_ps();
2168
+
2169
+ float summs = 0.f;
2170
+
2171
+ for (int i = 0; i < nb; ++i) {
2172
+
2173
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
2174
+ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
2175
+
2176
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
2177
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2178
+
2179
+ memcpy(utmp, x[i].scales, 12);
2180
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2181
+ const uint32_t uaux = utmp[1] & kmask1;
2182
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2183
+ utmp[2] = uaux;
2184
+ utmp[0] &= kmask1;
2185
+
2186
+ const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
2187
+ const __m128i scales = _mm_cvtepu8_epi16(utmps);
2188
+ const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
2189
+
2190
+ const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
2191
+ const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
2192
+ const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
2193
+ const __m128i prod = _mm_madd_epi16(mins, q8s);
2194
+ const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
2195
+ summs += dmin * _mm_extract_epi32(hsum, 0);
2196
+
2197
+ const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
2198
+ const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
2199
+ __m128i hmask = mone;
2200
+
2201
+ __m128i sumi_0 = _mm_setzero_si128();
2202
+ __m128i sumi_1 = _mm_setzero_si128();
2203
+
2204
+ int bit = 0;
2205
+
2206
+ __m128i shuffle = _mm_set1_epi16(0x0100);
2207
+ for (int j = 0; j < QK_K/64; ++j) {
2208
+
2209
+ const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
2210
+ shuffle = _mm_add_epi16(shuffle, m2);
2211
+ const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
2212
+ shuffle = _mm_add_epi16(shuffle, m2);
2213
+
2214
+ const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
2215
+ const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
2216
+
2217
+ __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
2218
+ __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
2219
+ __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
2220
+ __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
2221
+ __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0);
2222
+ __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1);
2223
+ hmask = _mm_slli_epi16(hmask, 1);
2224
+
2225
+ __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2226
+ __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2227
+ __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
2228
+ __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
2229
+ p16_0 = _mm_madd_epi16(scale_0, p16_0);
2230
+ p16_1 = _mm_madd_epi16(scale_0, p16_1);
2231
+
2232
+ q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
2233
+ q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
2234
+ q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
2235
+ q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
2236
+ q5_0 = _mm_add_epi8(q5l_0, q5h_0);
2237
+ q5_1 = _mm_add_epi8(q5l_1, q5h_1);
2238
+ hmask = _mm_slli_epi16(hmask, 1);
2239
+
2240
+ q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2241
+ q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2242
+ __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
2243
+ __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
2244
+ p16_2 = _mm_madd_epi16(scale_1, p16_2);
2245
+ p16_3 = _mm_madd_epi16(scale_1, p16_3);
2246
+
2247
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
2248
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
2249
+
2250
+ }
2251
+
2252
+ __m256 vd = _mm256_set1_ps(d);
2253
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
2254
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
2255
+
2256
+ }
2257
+
2258
+ *s = hsum_float_8(acc) + summs;
2259
+
2260
+ #else
2261
+
2262
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
2263
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
2264
+
2265
+ int8_t aux8[QK_K];
2266
+ int16_t aux16[8];
2267
+ float sums [8];
2268
+ int32_t aux32[8];
2269
+ memset(sums, 0, 8*sizeof(float));
2270
+
2271
+ float sumf = 0;
2272
+ for (int i = 0; i < nb; ++i) {
2273
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
2274
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
2275
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2276
+ memset(aux32, 0, 8*sizeof(int32_t));
2277
+ int8_t * GGML_RESTRICT a = aux8;
2278
+ uint8_t m = 1;
2279
+ for (int j = 0; j < QK_K/64; ++j) {
2280
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
2281
+ for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
2282
+ a += 32; m <<= 1;
2283
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
2284
+ for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
2285
+ a += 32; m <<= 1;
2286
+ q4 += 32;
2287
+ }
2288
+ memcpy(utmp, x[i].scales, 12);
2289
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2290
+ const uint32_t uaux = utmp[1] & kmask1;
2291
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2292
+ utmp[2] = uaux;
2293
+ utmp[0] &= kmask1;
2294
+
2295
+ int sumi = 0;
2296
+ for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
2297
+ a = aux8;
2298
+ int is = 0;
2299
+ for (int j = 0; j < QK_K/32; ++j) {
2300
+ int32_t scale = scales[is++];
2301
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2302
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2303
+ q8 += 8; a += 8;
2304
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2305
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2306
+ q8 += 8; a += 8;
2307
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2308
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2309
+ q8 += 8; a += 8;
2310
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2311
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2312
+ q8 += 8; a += 8;
2313
+ }
2314
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2315
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2316
+ const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
2317
+ sumf -= dmin * sumi;
2318
+ }
2319
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
2320
+ *s = sumf;
2321
+ #endif
2322
+ }
2323
+
2324
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // Writes to *s the dot product of n values read from vx (as block_q6_K) and vy (as block_q8_K).
2325
+ assert(n % QK_K == 0); // the input is consumed in whole blocks of QK_K values
2326
+ assert(nrc == 1); // only nrc == 1 is accepted by this implementation
2327
+ UNUSED(nrc);
2328
+ UNUSED(bx);
2329
+ UNUSED(by);
2330
+ UNUSED(bs);
2331
+ // nrc, bx, by and bs are part of the signature but are not read below.
2332
+ const block_q6_K * GGML_RESTRICT x = vx;
2333
+ const block_q8_K * GGML_RESTRICT y = vy;
2334
+
2335
+ const int nb = n / QK_K; // number of blocks to process
2336
+ // Three implementations follow, selected at compile time: AVX2, AVX, portable scalar.
2337
+ #if defined __AVX2__
2338
+
2339
+ const __m256i m4 = _mm256_set1_epi8(0xF); // mask: low nibble of every byte
2340
+ const __m256i m2 = _mm256_set1_epi8(3); // mask: low 2 bits of every byte
2341
+ const __m256i m32s = _mm256_set1_epi8(32); // the constant 32 that is removed from every 6-bit value
2342
+
2343
+ __m256 acc = _mm256_setzero_ps();
2344
+
2345
+ for (int i = 0; i < nb; ++i) {
2346
+
2347
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); // combined scale of the two blocks
2348
+
2349
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
2350
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
2351
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2352
+
2353
+ const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); // 16 bytes of per-group scales
2354
+
2355
+ __m256i sumi = _mm256_setzero_si256();
2356
+
2357
+ int is = 0;
2358
+
2359
+ for (int j = 0; j < QK_K/128; ++j) { // 128 values per iteration: 64 bytes of ql, 32 bytes of qh, 128 bytes of q8
2360
+ // get_scale_shuffle is defined elsewhere; presumably it yields the byte-shuffle mask selecting the scales for group is+k - TODO confirm
2361
+ const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
2362
+ const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
2363
+ const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
2364
+ const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
2365
+ is += 4;
2366
+
2367
+ const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
2368
+ const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
2369
+ const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
2370
+ // each qh byte carries four 2-bit fields; isolate each field and move it to bit positions 4..5
2371
+ const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
2372
+ const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
2373
+ const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
2374
+ const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
2375
+ // OR a 4-bit nibble from ql with the 2 high bits: unsigned 6-bit values (the -32 offset is applied later)
2376
+ const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
2377
+ const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
2378
+ const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
2379
+ const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
2380
+
2381
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2382
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2383
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2384
+ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2385
+ // 32 * q8, summed over adjacent byte pairs into 16-bit lanes (the offset term)
2386
+ __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
2387
+ __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
2388
+ __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
2389
+ __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
2390
+ // q * q8 with q as the unsigned operand, summed over adjacent byte pairs into 16-bit lanes
2391
+ __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
2392
+ __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
2393
+ __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
2394
+ __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
2395
+ // (q - 32) * q8 == q * q8 - 32 * q8
2396
+ p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
2397
+ p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
2398
+ p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
2399
+ p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
2400
+ // multiply by the scales (sign-extended from 8 to 16 bits) and widen the pair sums to 32 bits
2401
+ p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
2402
+ p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
2403
+ p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
2404
+ p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
2405
+
2406
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
2407
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
2408
+
2409
+ }
2410
+ // acc += d * sumi, per 32-bit lane
2411
+ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
2412
+ }
2413
+
2414
+ *s = hsum_float_8(acc); // hsum_float_8 is defined elsewhere; presumably the sum of the 8 float lanes - TODO confirm
2415
+
2416
+ #elif defined __AVX__
2417
+ // Same computation with 128-bit integer vectors; sumi_0 / sumi_1 later become the low / high half of one 256-bit value.
2418
+ const __m128i m3 = _mm_set1_epi8(3); // mask: low 2 bits of every byte
2419
+ const __m128i m15 = _mm_set1_epi8(15); // mask: low nibble of every byte
2420
+
2421
+ __m256 acc = _mm256_setzero_ps();
2422
+
2423
+ for (int i = 0; i < nb; ++i) {
2424
+
2425
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); // combined scale of the two blocks
2426
+
2427
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
2428
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
2429
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2430
+
2431
+ // handle the q6_k -32 offset separately using bsums
2432
+ const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
2433
+ const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
2434
+ const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
2435
+ const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales); // scales 0..7 sign-extended to 16 bits
2436
+ const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8)); // scales 8..15 sign-extended to 16 bits
2437
+ const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5); // << 5 multiplies the bsums * scales products by 32
2438
+ const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
2439
+
2440
+ __m128i sumi_0 = _mm_setzero_si128();
2441
+ __m128i sumi_1 = _mm_setzero_si128();
2442
+
2443
+ int is = 0;
2444
+
2445
+ for (int j = 0; j < QK_K/128; ++j) { // 128 values per iteration
2446
+
2447
+ const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
2448
+ const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
2449
+ // isolate each 2-bit field of the qh bytes in place (masks 3, 12, 48, -64) and shift it to bit positions 4..5
2450
+ const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
2451
+ const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
2452
+ const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
2453
+ const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
2454
+ const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
2455
+ const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
2456
+ const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
2457
+ const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
2458
+
2459
+ const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2460
+ const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2461
+ const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2462
+ const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2463
+ // OR a 4-bit nibble from ql with the 2 high bits: unsigned 6-bit values (offset handled through q8sclsub_*)
2464
+ const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
2465
+ const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
2466
+ const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
2467
+ const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
2468
+ const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
2469
+ const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
2470
+ const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
2471
+ const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
2472
+
2473
+ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2474
+ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2475
+ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2476
+ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2477
+ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2478
+ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2479
+ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2480
+ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2481
+ // q * q8 with q as the unsigned operand, summed over adjacent byte pairs into 16-bit lanes
2482
+ __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
2483
+ __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
2484
+ __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
2485
+ __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
2486
+ __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
2487
+ __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
2488
+ __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
2489
+ __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
2490
+ // get_scale_shuffle is defined elsewhere; presumably it yields the byte-shuffle mask selecting the scales for group is+k - TODO confirm
2491
+ const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
2492
+ const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
2493
+ const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
2494
+ const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
2495
+ is += 4;
2496
+ // the low 8 bytes of each shuffled scale vector feed the even products, the high 8 bytes the odd ones
2497
+ p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
2498
+ p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
2499
+ p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
2500
+ p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
2501
+ p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
2502
+ p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
2503
+ p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
2504
+ p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
2505
+
2506
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
2507
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
2508
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
2509
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
2510
+
2511
+ }
2512
+ // remove the offset term (32 * bsums * scales) computed before the loop
2513
+ sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
2514
+ sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
2515
+ const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
2516
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
2517
+ }
2518
+
2519
+ *s = hsum_float_8(acc); // hsum_float_8 is defined elsewhere; presumably the sum of the 8 float lanes - TODO confirm
2520
+
2521
+ #else
2522
+ // Portable scalar path.
2523
+ int8_t aux8[QK_K]; // the unpacked values of one block, offset already subtracted
2524
+ int16_t aux16[8]; // products of 8 consecutive value pairs
2525
+ float sums [8]; // 8 running float partial sums across all blocks
2526
+ int32_t aux32[8]; // 8 running integer partial sums for the current block
2527
+ memset(sums, 0, 8*sizeof(float));
2528
+
2529
+ float sumf = 0;
2530
+ for (int i = 0; i < nb; ++i) {
2531
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
2532
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
2533
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2534
+ memset(aux32, 0, 8*sizeof(int32_t));
2535
+ int8_t * GGML_RESTRICT a = aux8;
2536
+ for (int j = 0; j < QK_K; j += 128) { // unpack 128 values per iteration from 64 ql bytes and 32 qh bytes
2537
+ for (int l = 0; l < 32; ++l) {
2538
+ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; // low nibble + qh bits 0..1, minus 32
2539
+ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; // low nibble + qh bits 2..3, minus 32
2540
+ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; // high nibble + qh bits 4..5, minus 32
2541
+ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; // high nibble + qh bits 6..7, minus 32
2542
+ }
2543
+ a += 128;
2544
+ q4 += 64;
2545
+ qh += 32;
2546
+ }
2547
+ a = aux8;
2548
+ int is = 0;
2549
+ for (int j = 0; j < QK_K/16; ++j) { // one scale per 16 values, handled as two runs of 8
2550
+ int scale = x[i].scales[is++];
2551
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2552
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2553
+ q8 += 8; a += 8;
2554
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2555
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2556
+ q8 += 8; a += 8;
2557
+ }
2558
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
2559
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2560
+ }
2561
+ for (int l = 0; l < 8; ++l) sumf += sums[l]; // reduce the 8 partial sums
2562
+ *s = sumf;
2563
+ #endif
2564
+ }
2565
+
2566
+ #if defined (__AVX__) || defined (__AVX2__) // the table below is only compiled for the AVX / AVX2 builds
2567
+ static const int8_t keven_signs_q2xs[1024] = { // 128 groups of 8 sign bytes (+1 / -1), four groups per source row; the SIMD code reads one group as a single 64-bit value
2568
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, // groups 0..3: byte j (j < 7) is -1 when bit j of the group index is set; the 8th byte makes the count of -1 even
2569
+ 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
2570
+ 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1,
2571
+ 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
2572
+ 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1,
2573
+ 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1,
2574
+ 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
2575
+ 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1,
2576
+ 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1,
2577
+ 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1,
2578
+ 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1,
2579
+ 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1,
2580
+ 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1,
2581
+ 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1,
2582
+ 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1,
2583
+ 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1,
2584
+ 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1,
2585
+ 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1,
2586
+ 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1,
2587
+ 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
2588
+ 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1,
2589
+ 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1,
2590
+ 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1,
2591
+ 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1,
2592
+ 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1,
2593
+ 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1,
2594
+ 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1,
2595
+ 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1,
2596
+ 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1,
2597
+ 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1,
2598
+ 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
2599
+ 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
2600
+ };
2601
+ #endif // __AVX__ || __AVX2__
2602
+
2603
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // Writes to *s the dot product of n values read from vx (as block_iq2_xxs) and vy (as block_q8_K).
2604
+ assert(n % QK_K == 0); // the input is consumed in whole blocks of QK_K values
2605
+ assert(nrc == 1); // only nrc == 1 is accepted by this implementation
2606
+ UNUSED(nrc);
2607
+ UNUSED(bx);
2608
+ UNUSED(by);
2609
+ UNUSED(bs);
2610
+ // nrc, bx, by and bs are part of the signature but are not read below.
2611
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
2612
+ const block_q8_K * GGML_RESTRICT y = vy;
2613
+
2614
+ const int nb = n / QK_K; // number of blocks to process
2615
+ // Three implementations follow, selected at compile time: AVX2, AVX, portable scalar.
2616
+ #if defined(__AVX2__)
2617
+ // NOTE(review): the int8_t sign table is read through a uint64_t pointer; confirm alignment and aliasing are acceptable for the targeted compilers
2618
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // one 64-bit entry = 8 sign bytes
2619
+
2620
+ uint32_t aux32[4]; // 16 bytes of qs: the data for two groups of 32 values
2621
+ const uint8_t * aux8 = (const uint8_t *)aux32; // byte view of aux32
2622
+
2623
+ __m256 accumf = _mm256_setzero_ps();
2624
+ for (int i = 0; i < nb; ++i) {
2625
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
2626
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2627
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2628
+ __m256i sumi1 = _mm256_setzero_si256();
2629
+ __m256i sumi2 = _mm256_setzero_si256();
2630
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { // two groups of 32 values per iteration
2631
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2632
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2633
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; // q2 is uint16_t*, so += 8 advances by the 16 bytes just copied
2634
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); // aux8[0..3]: grid indices of the first group
2635
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); // aux8[8..11]: grid indices of the second group
2636
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], // bits 0..27 of aux32[1]: four 7-bit sign-table indices
2637
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
2638
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], // same layout in aux32[3] for the second group
2639
+ signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
2640
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); // apply the signs to q8; the grid bytes stay the unsigned operand of maddubs
2641
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
2642
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
2643
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
2644
+ const uint16_t ls1 = aux32[1] >> 28; // top 4 bits: scale field of the first group
2645
+ const uint16_t ls2 = aux32[3] >> 28; // top 4 bits: scale field of the second group
2646
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); // the multiplier used is 2*ls + 1
2647
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
2648
+ sumi1 = _mm256_add_epi32(sumi1, p1);
2649
+ sumi2 = _mm256_add_epi32(sumi2, p2);
2650
+ }
2651
+ // accumf += d * (sumi1 + sumi2), per 32-bit lane
2652
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
2653
+
2654
+ }
2655
+ // hsum_float_8 is defined elsewhere (presumably the sum of the 8 float lanes - TODO confirm); the 0.125f factor is applied once to the total
2656
+ *s = 0.125f * hsum_float_8(accumf);
2657
+
2658
+ #elif defined(__AVX__)
2659
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // one 64-bit entry = 8 sign bytes; NOTE(review): type-puns int8_t[] as uint64_t, confirm alignment/aliasing
2660
+
2661
+ uint32_t aux32[4]; // 16 bytes of qs: the data for two groups of 32 values
2662
+ const uint8_t * aux8 = (const uint8_t *)aux32; // byte view of aux32
2663
+
2664
+ __m256 accumf = _mm256_setzero_ps();
2665
+ for (int i = 0; i < nb; ++i) {
2666
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
2667
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2668
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2669
+ __m128i sumi1_0 = _mm_setzero_si128();
2670
+ __m128i sumi1_1 = _mm_setzero_si128();
2671
+ __m128i sumi2_0 = _mm_setzero_si128();
2672
+ __m128i sumi2_1 = _mm_setzero_si128();
2673
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { // two groups of 32 values per iteration, each split into two 128-bit halves (_0 / _1)
2674
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2675
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2676
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2677
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2678
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; // q2 is uint16_t*, so += 8 advances by the 16 bytes just copied
2679
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); // aux8[0..3]: grid indices of the first group
2680
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
2681
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); // aux8[8..11]: grid indices of the second group
2682
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
2683
+ const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); // bits 0..27 of aux32[1]: four 7-bit sign-table indices
2684
+ const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
2685
+ const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); // same layout in aux32[3] for the second group
2686
+ const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
2687
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); // apply the signs to q8; the grid bytes stay the unsigned operand of maddubs
2688
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
2689
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
2690
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
2691
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
2692
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
2693
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
2694
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
2695
+ const uint16_t ls1 = aux32[1] >> 28; // top 4 bits: scale field of the first group
2696
+ const uint16_t ls2 = aux32[3] >> 28; // top 4 bits: scale field of the second group
2697
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); // the multiplier used is 2*ls + 1
2698
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
2699
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
2700
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
2701
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
2702
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
2703
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
2704
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
2705
+ }
2706
+ // add the two groups' sums half by half, join the halves via MM256_SET_M128I (defined elsewhere), then accumf += d * sums
2707
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
2708
+
2709
+ }
2710
+ // hsum_float_8 is defined elsewhere (presumably the sum of the 8 float lanes - TODO confirm); the 0.125f factor is applied once to the total
2711
+ *s = 0.125f * hsum_float_8(accumf);
2712
+
2713
+ #else
2714
+ // Portable scalar path: one group of 32 values at a time.
2715
+ uint32_t aux32[2]; // 8 bytes of qs: the data for one group of 32 values
2716
+ const uint8_t * aux8 = (const uint8_t *)aux32; // byte view of aux32
2717
+
2718
+ float sumf = 0.f;
2719
+ for (int i = 0; i < nb; ++i) {
2720
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
2721
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2722
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2723
+ int32_t bsum = 0;
2724
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2725
+ memcpy(aux32, q2, 2*sizeof(uint32_t));
2726
+ q2 += 4; // q2 is uint16_t*, so this advances by the 8 bytes just copied
2727
+ const uint32_t ls = 2*(aux32[1] >> 28) + 1; // multiplier built from the top 4 bits of aux32[1]
2728
+ int32_t sumi = 0;
2729
+ for (int l = 0; l < 4; ++l) { // four sub-groups of 8 values
2730
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); // the 8 bytes of the grid entry selected by index byte l
2731
+ const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; // sign mask looked up with the l-th 7-bit field of aux32[1]
2732
+ for (int j = 0; j < 8; ++j) {
2733
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); // negate when the mask bit selected by kmask_iq2xs[j] is set
2734
+ }
2735
+ q8 += 8;
2736
+ }
2737
+ bsum += sumi * ls;
2738
+ }
2739
+ sumf += d * bsum;
2740
+ }
2741
+ *s = 0.125f * sumf; // the 0.125f factor is applied once to the total
2742
+ #endif
2743
+ }
2744
+
2745
// Dot product of n values stored as block_iq2_xs (vx) with n values stored as
// block_q8_K (vy); the single float result is written to *s.
//
//   n          - element count; must be a multiple of QK_K (asserted)
//   s          - output, receives one float
//   vx         - array of n / QK_K block_iq2_xs
//   vy         - array of n / QK_K block_q8_K
//   nrc        - must be 1 (asserted)
//   bs, bx, by - not used by this function
//
// One of three implementations is chosen at compile time: AVX2, AVX (128-bit
// integer operations), or a portable scalar loop. In every branch each block
// contributes d * (integer sum), where d = fp16-to-fp32(x[i].d) * y[i].d and
// the integer sum weights every group by a scale of 2*nibble + 1; the final
// total is multiplied by 0.125f before being stored.
+ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2746
+ assert(n % QK_K == 0);
2747
+ assert(nrc == 1);
2748
+ UNUSED(nrc);
2749
+ UNUSED(bx);
2750
+ UNUSED(by);
2751
+ UNUSED(bs);
2752
+
2753
+ const block_iq2_xs * GGML_RESTRICT x = vx;
2754
+ const block_q8_K * GGML_RESTRICT y = vy;
2755
+
2756
+ const int nb = n / QK_K;
2757
+
2758
// AVX2 branch: each inner-loop step consumes 16 q2 entries and 128 q8 bytes
// (four groups of 32 values).
+ #if defined(__AVX2__)
2759
+
2760
+ const __m256i mone = _mm256_set1_epi8(1);
2761
// Byte-shuffle controls: mask_1 replicates source bytes 0, 2, 4, 6 (the low
// byte of 16-bit lanes 0..3) eight times each; mask_2 does the same for
// bytes 8, 10, 12, 14 (lanes 4..7). This spreads one sign byte over the
// eight q8 bytes it applies to.
+ static const char block_sign_shuffle_mask_1[32] = {
2762
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
2763
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
2764
+ };
2765
+ static const char block_sign_shuffle_mask_2[32] = {
2766
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
2767
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
2768
+ };
2769
// Bit j (0x01 << j) for the j-th byte of every group of eight.
+ static const uint8_t bit_selector_mask_bytes[32] = {
2770
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2771
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2772
+ };
2773
+
2774
+ const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
2775
+ const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
2776
+ const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
2777
+
2778
// Parity lookup (same 16 entries in both halves): entry v is 0x80 when the
// 4-bit index v has an odd number of set bits, 0x00 otherwise.
+ static const uint8_t k_bit_helper[32] = {
2779
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
2780
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
2781
+ };
2782
+ const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
2783
+ const __m256i m511 = _mm256_set1_epi16(511);
2784
+ const __m128i m4 = _mm_set1_epi8(0xf);
2785
+ const __m128i m1 = _mm_set1_epi8(1);
2786
+
2787
+ uint64_t aux64;
2788
+
2789
+ // somewhat hacky, but gives a significant boost in performance
2790
// gindex views aux_gindex as 16 uint16_t values so the vector-computed grid
// indices can be read back as scalars.
+ __m256i aux_gindex;
2791
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex;
2792
+
2793
+ __m256 accumf = _mm256_setzero_ps();
2794
+ for (int i = 0; i < nb; ++i) {
2795
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2796
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2797
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2798
+
2799
// Unpack the 8 scale bytes into 16 nibbles, interleaved as low nibble then
// high nibble of each byte, and map every nibble v to 2*v + 1.
+ memcpy(&aux64, x[i].scales, 8);
2800
+ __m128i stmp = _mm_set1_epi64x(aux64);
2801
+ stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
2802
+ const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
2803
+
2804
+ __m256i sumi1 = _mm256_setzero_si256();
2805
+ __m256i sumi2 = _mm256_setzero_si256();
2806
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
2807
+
2808
// The low 9 bits of each 16-bit q2 entry are the iq2xs_grid index.
+ const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16;
2809
+ aux_gindex = _mm256_and_si256(q2_data, m511);
2810
+
2811
// The upper 7 bits of each entry are the stored sign bits. XOR-folding bits
// 4..6 onto bits 0..3 yields a 4-bit value with the same parity as the 7
// stored bits; the bit_helper lookup turns odd parity into bit 7, which is
// OR-ed in to complete an 8-bit sign mask.
// NOTE(review): presumably equals ksigns_iq2xs[q2 >> 9] used by the scalar
// branch; that table is not visible in this chunk, so confirm.
+ const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
2812
+ const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
2813
+ const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
2814
+
2815
+ const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
2816
+ const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
2817
+
2818
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2819
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2820
+ const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2821
+ const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
2822
+
2823
// Gather four 64-bit grid entries per vector using the indices read back
// through gindex.
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
2824
+ iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
2825
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
2826
+ iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
2827
+ const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
2828
+ iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
2829
+ const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
2830
+ iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
2831
+
2832
// Copy each 128-bit half of the sign masks into both halves of a 256-bit
// register: _mm256_shuffle_epi8 only shuffles within a 128-bit lane.
+ const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
2833
+ const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
2834
+ const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
2835
+ const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
2836
+
2837
// For each q8 vector: broadcast the sign byte of every 8-value group, test
// bit j per byte (cmpeq gives 0xFF where set), OR with 1 to obtain -1 or +1,
// and let _mm256_sign_epi8 negate the q8 bytes whose factor is negative.
+ __m256i signs;
2838
+ signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
2839
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
2840
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
2841
+
2842
+ signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
2843
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
2844
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
2845
+
2846
+ signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
2847
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
2848
+ const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
2849
+
2850
+ signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
2851
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
2852
+ const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
2853
+
2854
// maddubs: first operand bytes (grid values) are taken as unsigned, second
// operand bytes (signed q8) as signed; adjacent products are summed to 16 bit.
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
2855
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
2856
+ const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3);
2857
+ const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4);
2858
+
2859
// NOTE(review): get_scale_shuffle() is defined outside this chunk; presumably
// it returns the byte-shuffle control that lines the scales of group ib32+k
// up with the 16-bit lanes of the matching dot vector - confirm.
+ const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
2860
+ const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
2861
+ const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
2862
+ const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
2863
+
2864
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
2865
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
2866
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
2867
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
2868
+ }
2869
+
2870
// Convert the block's integer sums to float, scale by d and accumulate.
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
2871
+
2872
+ }
2873
+
2874
+ *s = 0.125f * hsum_float_8(accumf);
2875
+
2876
// AVX branch: follows the AVX2 branch step by step, but every integer
// operation is done on 128-bit halves; the _0 / _1 suffixes generally denote
// the low / high half of the corresponding AVX2 value.
+ #elif defined(__AVX__)
2877
+ const __m128i mone = _mm_set1_epi8(1);
2878
+ static const char block_sign_shuffle_mask_1[32] = {
2879
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
2880
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
2881
+ };
2882
+ static const char block_sign_shuffle_mask_2[32] = {
2883
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
2884
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
2885
+ };
2886
+ static const uint8_t bit_selector_mask_bytes[32] = {
2887
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2888
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2889
+ };
2890
+
2891
+ const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
2892
+ const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
2893
+ const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
2894
+ const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
2895
+ const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
2896
+ const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
2897
+
2898
+ static const uint8_t k_bit_helper[32] = {
2899
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
2900
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
2901
+ };
2902
+ const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
2903
+ const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
2904
+ const __m128i m511 = _mm_set1_epi16(511);
2905
+ const __m128i m4 = _mm_set1_epi8(0xf);
2906
+ const __m128i m1 = _mm_set1_epi8(1);
2907
+
2908
+ uint64_t aux64;
2909
+
2910
+ // somewhat hacky, but gives a significant boost in performance
2911
+ __m256i aux_gindex;
2912
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex;
2913
+
2914
+ __m256 accumf = _mm256_setzero_ps();
2915
+ for (int i = 0; i < nb; ++i) {
2916
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2917
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2918
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2919
+
2920
+ memcpy(&aux64, x[i].scales, 8);
2921
+ __m128i stmp = _mm_set1_epi64x(aux64);
2922
+ stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
2923
+ const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
2924
+
2925
+ __m128i sumi1_0 = _mm_setzero_si128();
2926
+ __m128i sumi1_1 = _mm_setzero_si128();
2927
+ __m128i sumi2_0 = _mm_setzero_si128();
2928
+ __m128i sumi2_1 = _mm_setzero_si128();
2929
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
2930
+
2931
+ const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
2932
+ const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
2933
+ aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
2934
+
2935
+ const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
2936
+ const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
2937
+ const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
2938
+ const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
2939
+ const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
2940
+ const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
2941
+
2942
+ const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
2943
+ const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
2944
+ const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
2945
+ const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
2946
+
2947
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2948
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2949
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2950
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2951
+ const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2952
+ const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2953
+ const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2954
+ const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
2955
+
2956
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
2957
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
2958
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
2959
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
2960
+ const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
2961
+ const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
2962
+ const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
2963
+ const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
2964
+
2965
+ // AVX2 full_signs_1 is full_sign_bits_0 here
2966
+ // AVX2 full_signs_2 is full_sign_bits_1 here
2967
+ __m128i signs_0, signs_1;
2968
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
2969
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
2970
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
2971
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
2972
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
2973
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
2974
+
2975
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
2976
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
2977
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
2978
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
2979
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
2980
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
2981
+
2982
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
2983
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
2984
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
2985
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
2986
+ const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
2987
+ const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
2988
+
2989
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
2990
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
2991
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
2992
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
2993
+ const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
2994
+ const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
2995
+
2996
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
2997
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
2998
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
2999
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
3000
+ const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
3001
+ const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
3002
+ const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
3003
+ const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
3004
+
3005
// Each shuffled scale vector is widened to 16 bit in two steps: its low 8
// bytes directly, its high 8 bytes after a byte shift right by 8.
+ __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
3006
+ const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
3007
+ const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
3008
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
3009
+ const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
3010
+ const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
3011
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
3012
+ const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
3013
+ const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
3014
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
3015
+ const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
3016
+ const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
3017
+
3018
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
3019
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
3020
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
3021
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
3022
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
3023
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
3024
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
3025
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
3026
+ }
3027
+
3028
// Separate multiply and add here, where the AVX2 branch uses _mm256_fmadd_ps.
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
3029
+
3030
+ }
3031
+
3032
+ *s = 0.125f * hsum_float_8(accumf);
3033
+
3034
// Portable scalar branch: one ib32 step covers 32 values (four q2 entries of
// 8 values each).
+ #else
3035
+
3036
+ float sumf = 0.f;
3037
+ for (int i = 0; i < nb; ++i) {
3038
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3039
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
3040
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
3041
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3042
+ int32_t bsum = 0;
3043
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3044
// ls1 (low nibble) weights q2[0..1], ls2 (high nibble) weights q2[2..3].
+ const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
3045
+ const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
3046
+ int32_t sumi = 0;
3047
+ for (int l = 0; l < 2; ++l) {
3048
// Low 9 bits select the grid entry, read here as 8 unsigned bytes; the upper
// 7 bits select an 8-bit sign mask from ksigns_iq2xs.
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
3049
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
3050
+ for (int j = 0; j < 8; ++j) {
3051
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
3052
+ }
3053
+ q8 += 8;
3054
+ }
3055
+ bsum += sumi * ls1;
3056
+ sumi = 0;
3057
+ for (int l = 2; l < 4; ++l) {
3058
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
3059
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
3060
+ for (int j = 0; j < 8; ++j) {
3061
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
3062
+ }
3063
+ q8 += 8;
3064
+ }
3065
+ bsum += sumi * ls2;
3066
+ q2 += 4;
3067
+ }
3068
+ sumf += d * bsum;
3069
+ }
3070
+ *s = 0.125f * sumf;
3071
+ #endif
3072
+ }
3073
+
3074
// Dot product of n values stored as block_iq2_s (vx) with n values stored as
// block_q8_K (vy); the single float result is written to *s.
//
//   n          - element count; must be a multiple of QK_K (asserted)
//   s          - output, receives one float
//   vx         - array of n / QK_K block_iq2_s
//   vy         - array of n / QK_K block_q8_K
//   nrc        - must be 1 (asserted)
//   bs, bx, by - not used by this function
//
// One of three implementations is chosen at compile time: AVX2, AVX (128-bit
// integer operations), or a portable scalar loop. Each grid index combines a
// byte from qs with a 2-bit field from qh; sign bytes are read starting at
// qs + QK_K/8. Per block the integer sum (groups weighted by 2*nibble + 1) is
// multiplied by d = fp16-to-fp32(x[i].d) * y[i].d, and the final total is
// multiplied by 0.125f before being stored.
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3075
+ assert(n % QK_K == 0);
3076
+ assert(nrc == 1);
3077
+ UNUSED(nrc);
3078
+ UNUSED(bx);
3079
+ UNUSED(by);
3080
+ UNUSED(bs);
3081
+
3082
+ const block_iq2_s * GGML_RESTRICT x = vx;
3083
+ const block_q8_K * GGML_RESTRICT y = vy;
3084
+
3085
+ const int nb = n / QK_K;
3086
+
3087
// AVX2 branch: each inner-loop step consumes 8 qs bytes, 4 sign words and
// 64 q8 bytes (two groups of 32 values).
+ #if defined(__AVX2__)
3088
+
3089
// Byte-shuffle control: replicates source bytes 0, 1, 2 and 3 eight times each.
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
3090
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
3091
+ };
3092
+
3093
// Bit j (0x01 << j) for the j-th byte of every group of eight.
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3094
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3095
+ };
3096
+
3097
+ const __m128i m4 = _mm_set1_epi8(0xf);
3098
+ const __m128i m1 = _mm_set1_epi8(1);
3099
+
3100
+ const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
3101
+ const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
3102
+
3103
+ uint64_t aux64;
3104
+
3105
+ __m256 accumf = _mm256_setzero_ps();
3106
+ for (int i = 0; i < nb; ++i) {
3107
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3108
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
3109
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
3110
// Sign data starts QK_K/8 bytes into qs and is consumed as 16-bit words.
// NOTE(review): this reads a uint8_t array through a uint16_t pointer; it
// relies on the target/compiler tolerating that access - confirm.
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
3111
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3112
+
3113
// Low 8 bytes of scales8: low nibbles of the 8 scale bytes; high 8 bytes:
// their high nibbles (via aux64 >> 4). Every nibble v becomes 2*v + 1.
+ memcpy(&aux64, x[i].scales, 8);
3114
+ const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
3115
+ const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
3116
+
3117
+ __m256i sumi1 = _mm256_setzero_si256();
3118
+ __m256i sumi2 = _mm256_setzero_si256();
3119
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
3120
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3121
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3122
// 10-bit iq2s_grid index: bits 0..7 come from a qs byte, bits 8..9 from one
// 2-bit field of qh (shifts 8, 6, 4, 2 select qh bits 0-1, 2-3, 4-5, 6-7).
+ const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
3123
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
3124
+ iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
3125
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
3126
+ const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
3127
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
3128
+ iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
3129
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
3130
+ qs += 8;
3131
+
3132
// Broadcast two sign words (4 sign bytes) to every 32-bit lane, spread byte k
// over the eight positions of group k (mask1) and isolate bit j (mask2).
// cmpeq then gives m = 0xFF where the sign bit is set, and (q8 ^ m) - m
// negates exactly those q8 bytes.
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
3133
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
3134
+ const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
3135
+ const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
3136
+
3137
+ aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
3138
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
3139
+ const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
3140
+ const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
3141
+
3142
+ signs += 4;
3143
+
3144
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
3145
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
3146
+
3147
// NOTE(review): get_scale_shuffle_k4() is defined outside this chunk;
// presumably it returns the shuffle control that lines the 16-bit scales of
// group ib32+k up with the lanes of the matching dot vector - confirm.
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
3148
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
3149
+ sumi1 = _mm256_add_epi32(sumi1, p1);
3150
+ sumi2 = _mm256_add_epi32(sumi2, p2);
3151
+ }
3152
+
3153
// Convert the block's integer sums to float, scale by d and accumulate.
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
3154
+
3155
+ }
3156
+
3157
+ *s = 0.125f * hsum_float_8(accumf);
3158
+
3159
// AVX branch: follows the AVX2 branch step by step, but every integer
// operation is done on 128-bit halves (_0 = low half, _1 = high half).
+ #elif defined(__AVX__)
3160
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
3161
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
3162
+ };
3163
+
3164
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3165
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3166
+ };
3167
+
3168
+ const __m128i m4 = _mm_set1_epi8(0xf);
3169
+ const __m128i m1 = _mm_set1_epi8(1);
3170
+
3171
+ const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
3172
+ const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
3173
+ const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
3174
+ const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
3175
+
3176
+ uint64_t aux64;
3177
+
3178
+ __m256 accumf = _mm256_setzero_ps();
3179
+ for (int i = 0; i < nb; ++i) {
3180
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3181
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
3182
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
3183
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
3184
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3185
+
3186
// scales16_0 holds the widened low-nibble scales, scales16_1 the widened
// high-nibble scales.
+ memcpy(&aux64, x[i].scales, 8);
3187
+ const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
3188
+ const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
3189
+ const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
3190
+
3191
+ __m128i sumi1_0 = _mm_setzero_si128();
3192
+ __m128i sumi1_1 = _mm_setzero_si128();
3193
+ __m128i sumi2_0 = _mm_setzero_si128();
3194
+ __m128i sumi2_1 = _mm_setzero_si128();
3195
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
3196
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3197
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3198
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3199
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3200
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
3201
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
3202
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
3203
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
3204
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
3205
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
3206
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
3207
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
3208
+ qs += 8;
3209
+
3210
+ __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
3211
+ __m128i aux128_1 = aux128_0;
3212
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
3213
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
3214
+ const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
3215
+ const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
3216
+ const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
3217
+ const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
3218
+
3219
+ aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
3220
+ aux128_1 = aux128_0;
3221
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
3222
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
3223
+ const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
3224
+ const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
3225
+ const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
3226
+ const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
3227
+
3228
+ signs += 4;
3229
+
3230
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
3231
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
3232
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
3233
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
3234
+
3235
// The 256-bit shuffle control from get_scale_shuffle_k4() is split into its
// two 128-bit halves, applied to scales16_0 and scales16_1 respectively.
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
3236
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
3237
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
3238
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
3239
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
3240
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
3241
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
3242
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
3243
+ }
3244
+
3245
// Separate multiply and add here, where the AVX2 branch uses _mm256_fmadd_ps.
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
3246
+
3247
+ }
3248
+
3249
+ *s = 0.125f * hsum_float_8(accumf);
3250
+
3251
// Portable scalar branch: one ib32 step covers 32 values (four qs bytes and
// four sign bytes, 8 values each).
+ #else
3252
+
3253
+ float sumf = 0;
3254
+ for (int i = 0; i < nb; i++) {
3255
+
3256
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3257
+ const int8_t * q8 = y[i].qs;
3258
+ const uint8_t * qs = x[i].qs;
3259
+ const uint8_t * qh = x[i].qh;
3260
+ const uint8_t * signs = qs + QK_K/8;
3261
+
3262
+ int bsum = 0;
3263
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3264
// ls1 (low nibble) weights sub-groups 0..1, ls2 (high nibble) sub-groups 2..3.
+ int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
3265
+ int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
3266
+ int sumi1 = 0, sumi2 = 0;
3267
+ for (int l = 0; l < 2; ++l) {
3268
// `<<` binds tighter than `&`: the index is qs[l] | ((qh[ib32] << (8-2*l)) & 0x300).
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
3269
+ for (int j = 0; j < 8; ++j) {
3270
+ sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
3271
+ }
3272
+ q8 += 8;
3273
+ }
3274
+ for (int l = 2; l < 4; ++l) {
3275
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
3276
+ for (int j = 0; j < 8; ++j) {
3277
+ sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
3278
+ }
3279
+ q8 += 8;
3280
+ }
3281
+ bsum += ls1 * sumi1 + ls2 * sumi2;
3282
+ qs += 4;
3283
+ signs += 4;
3284
+ }
3285
+
3286
+ sumf += d * bsum;
3287
+ }
3288
+
3289
+ *s = 0.125f * sumf;
3290
+
3291
+ #endif
3292
+
3293
+ }
3294
+
3295
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3296
+ assert(n % QK_K == 0);
3297
+ assert(nrc == 1);
3298
+ UNUSED(nrc);
3299
+ UNUSED(bx);
3300
+ UNUSED(by);
3301
+ UNUSED(bs);
3302
+
3303
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
3304
+ const block_q8_K * GGML_RESTRICT y = vy;
3305
+
3306
+ const int nb = n / QK_K;
3307
+
3308
+ #if defined(__AVX2__)
3309
+
3310
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
3311
+
3312
+ uint32_t aux32[2];
3313
+
3314
+ __m256 accumf = _mm256_setzero_ps();
3315
+ for (int i = 0; i < nb; ++i) {
3316
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3317
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
3318
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
3319
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3320
+ __m256i sumi1 = _mm256_setzero_si256();
3321
+ __m256i sumi2 = _mm256_setzero_si256();
3322
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
3323
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3324
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3325
+ const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
3326
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
3327
+ q3 += 8;
3328
+ const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
3329
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
3330
+ q3 += 8;
3331
+ memcpy(aux32, gas, 8); gas += 8;
3332
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
3333
+ signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
3334
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
3335
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
3336
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
3337
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
3338
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
3339
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
3340
+ const uint16_t ls1 = aux32[0] >> 28;
3341
+ const uint16_t ls2 = aux32[1] >> 28;
3342
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
3343
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
3344
+ sumi1 = _mm256_add_epi32(sumi1, p1);
3345
+ sumi2 = _mm256_add_epi32(sumi2, p2);
3346
+ }
3347
+
3348
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
3349
+
3350
+ }
3351
+
3352
+ *s = 0.25f * hsum_float_8(accumf);
3353
+
3354
+ #elif defined(__AVX__)
3355
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
3356
+
3357
+ uint32_t aux32[2];
3358
+
3359
+ __m256 accumf = _mm256_setzero_ps();
3360
+ for (int i = 0; i < nb; ++i) {
3361
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3362
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
3363
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
3364
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3365
+ __m128i sumi1_0 = _mm_setzero_si128();
3366
+ __m128i sumi1_1 = _mm_setzero_si128();
3367
+ __m128i sumi2_0 = _mm_setzero_si128();
3368
+ __m128i sumi2_1 = _mm_setzero_si128();
3369
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
3370
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3371
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3372
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3373
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3374
+ const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
3375
+ const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
3376
+ q3 += 8;
3377
+ const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
3378
+ const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
3379
+ q3 += 8;
3380
+ memcpy(aux32, gas, 8); gas += 8;
3381
+ const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
3382
+ const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
3383
+ const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
3384
+ const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
3385
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
3386
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
3387
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
3388
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
3389
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
3390
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
3391
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
3392
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
3393
+ const uint16_t ls1 = aux32[0] >> 28;
3394
+ const uint16_t ls2 = aux32[1] >> 28;
3395
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
3396
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
3397
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
3398
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
3399
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
3400
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
3401
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
3402
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
3403
+ }
3404
+
3405
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
3406
+
3407
+ }
3408
+
3409
+ *s = 0.25f * hsum_float_8(accumf);
3410
+
3411
+ #else
3412
+
3413
+ uint32_t aux32;
3414
+
3415
+ float sumf = 0.f;
3416
+ for (int i = 0; i < nb; ++i) {
3417
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3418
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
3419
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
3420
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3421
+ int32_t bsum = 0;
3422
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3423
+ memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
3424
+ const uint32_t ls = 2*(aux32 >> 28) + 1;
3425
+ int32_t sumi = 0;
3426
+ for (int l = 0; l < 4; ++l) {
3427
+ const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
3428
+ const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
3429
+ const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
3430
+ for (int j = 0; j < 4; ++j) {
3431
+ sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
3432
+ sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
3433
+ }
3434
+ q8 += 8;
3435
+ }
3436
+ q3 += 8;
3437
+ bsum += sumi * ls;
3438
+ }
3439
+ sumf += d * bsum;
3440
+ }
3441
+ *s = 0.25f * sumf;
3442
+ #endif
3443
+ }
3444
+
3445
+ // Dot product of n IQ3_S-quantized values (vx, block_iq3_s) with n Q8_K-quantized values
+ // (vy, block_q8_K); the single float result is stored in *s.
+ //
+ //   n          - element count; must be a multiple of QK_K (asserted)
+ //   s          - output location for the scalar result
+ //   vx, vy     - the two operands, n / QK_K blocks each
+ //   bs, bx, by - unused
+ //   nrc        - must be 1 (asserted)
+ //
+ // Encoding as consumed here: every group of 4 weights is a 9-bit index into iq3s_grid
+ // (low 8 bits from qs, bit 8 from qh) selecting 4 unsigned bytes; one bit of signs per
+ // weight flips the sign of the matching product; each 32-weight sub-block is scaled by
+ // 2*ls+1, where ls is a 4-bit nibble of scales; each block is scaled by x[i].d * y[i].d.
+ // One of three implementations (AVX2, AVX, portable C) is selected at compile time.
+ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3446
+ assert(n % QK_K == 0);
3447
+ assert(nrc == 1);
3448
+ UNUSED(nrc);
3449
+ UNUSED(bx);
3450
+ UNUSED(by);
3451
+ UNUSED(bs);
3452
+
3453
+ const block_iq3_s * GGML_RESTRICT x = vx;
3454
+ const block_q8_K * GGML_RESTRICT y = vy;
3455
+
3456
+ const int nb = n / QK_K; // number of blocks to process
3457
+
3458
+ #if defined(__AVX2__)
3459
+ // k_mask1: byte-shuffle control that repeats sign byte b (0..3) across lanes 8b..8b+7.
3460
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
3461
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
3462
+ };
3463
+ // k_mask2: isolates bit (lane % 8) of the repeated sign byte.
3464
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3465
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3466
+ };
3467
+
3468
+ const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
3469
+ const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
3470
+ // Lane k is shifted left by 8-k so that bit k of a qh byte lands on bit 8 of the grid index.
3471
+ const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3472
+ const __m256i idx_mask = _mm256_set1_epi32(256);
3473
+ // Scratch union: indices are computed as vectors and read back as scalars for the grid lookups.
3474
+ typedef union {
3475
+ __m256i vec[2];
3476
+ uint32_t index[16];
3477
+ } index_t;
3478
+
3479
+ index_t idx;
3480
+
3481
+ __m256 accumf = _mm256_setzero_ps();
3482
+ for (int i = 0; i < nb; ++i) {
3483
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined block scale
3484
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
3485
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
3486
+ const uint8_t * GGML_RESTRICT signs = x[i].signs; // kept as bytes; 32-bit groups are loaded with memcpy below
3487
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3488
+ __m256i sumi1 = _mm256_setzero_si256();
3489
+ __m256i sumi2 = _mm256_setzero_si256();
3490
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { // two 32-weight sub-blocks per iteration
3491
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3492
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
3493
+ const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16; // 16 low index bytes
3494
+ idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
3495
+ idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
3496
+ idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask); // bit 8 of indices 0..7
3497
+ idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask); // bit 8 of indices 8..15
3498
+ idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
3499
+ idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
3500
+
3501
+ // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
3502
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
3503
+ //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
3504
+ const __m256i q2_1 = _mm256_set_epi32(
3505
+ iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
3506
+ iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
3507
+ );
3508
+ const __m256i q2_2 = _mm256_set_epi32(
3509
+ iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
3510
+ iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
3511
+ );
3512
+ uint32_t sign_bits[2]; memcpy(sign_bits, signs, sizeof(sign_bits)); // 2 x 32 sign bits; memcpy avoids a type-punned load and a signed shift into bit 31
3513
+ __m256i aux256 = _mm256_set1_epi32((int)sign_bits[0]);
3514
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
3515
+ const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); // 0xFF in lanes whose sign bit is set
3516
+ const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); // (q8 ^ m) - m negates q8 where m == 0xFF
3517
+
3518
+ aux256 = _mm256_set1_epi32((int)sign_bits[1]);
3519
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
3520
+ const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
3521
+ const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
3522
+
3523
+ signs += sizeof(sign_bits);
3524
+
3525
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // unsigned grid bytes x signed q8, adjacent pairs summed
3526
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
3527
+ const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; // low nibble: first sub-block of the pair
3528
+ const uint16_t ls2 = x[i].scales[ib32/2] >> 4; // high nibble: second sub-block of the pair
3529
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
3530
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
3531
+ sumi1 = _mm256_add_epi32(sumi1, p1);
3532
+ sumi2 = _mm256_add_epi32(sumi2, p2);
3533
+ }
3534
+
3535
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
3536
+
3537
+ }
3538
+
3539
+ *s = hsum_float_8(accumf);
3540
+
3541
+ #elif defined(__AVX__)
3542
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
3543
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
3544
+ };
3545
+
3546
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3547
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
3548
+ };
3549
+ // Same sign masks as the AVX2 path, split into 128-bit halves.
3550
+ const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
3551
+ const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
3552
+ const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
3553
+ const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
3554
+ // Multiplying lane k by 2^(8-k) moves bit k of a qh byte to bit 8 (the AVX2 path uses a variable shift instead).
3555
+ const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
3556
+ const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
3557
+ const __m128i idx_mask = _mm_set1_epi32(256);
3558
+
3559
+ typedef union {
3560
+ __m128i vec[4];
3561
+ uint32_t index[16];
3562
+ } index_t;
3563
+
3564
+ index_t idx;
3565
+
3566
+ __m256 accumf = _mm256_setzero_ps();
3567
+ for (int i = 0; i < nb; ++i) {
3568
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined block scale
3569
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
3570
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
3571
+ const uint8_t * GGML_RESTRICT signs = x[i].signs; // kept as bytes; 32-bit groups are loaded with memcpy below
3572
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3573
+ __m128i sumi1_0 = _mm_setzero_si128();
3574
+ __m128i sumi1_1 = _mm_setzero_si128();
3575
+ __m128i sumi2_0 = _mm_setzero_si128();
3576
+ __m128i sumi2_1 = _mm_setzero_si128();
3577
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { // two 32-weight sub-blocks per iteration
3578
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3579
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3580
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3581
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3582
+ const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
3583
+ const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
3584
+ const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
3585
+ idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
3586
+ idx.vec[1] = idx.vec[0];
3587
+ idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
3588
+ idx.vec[3] = idx.vec[2];
3589
+
3590
+ idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
3591
+ idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
3592
+ idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
3593
+ idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
3594
+
3595
+ idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
3596
+ idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
3597
+ idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
3598
+ idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
3599
+
3600
+ const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
3601
+ const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
3602
+ const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
3603
+ const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
3604
+ uint32_t sign_bits[2]; memcpy(sign_bits, signs, sizeof(sign_bits)); // 2 x 32 sign bits; memcpy avoids a type-punned load and a signed shift into bit 31
3605
+ __m128i aux128_0 = _mm_set1_epi32((int)sign_bits[0]);
3606
+ __m128i aux128_1 = aux128_0;
3607
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
3608
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
3609
+ const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); // 0xFF in lanes whose sign bit is set
3610
+ const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
3611
+ const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); // (q8 ^ m) - m negates q8 where m == 0xFF
3612
+ const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
3613
+
3614
+ aux128_0 = _mm_set1_epi32((int)sign_bits[1]);
3615
+ aux128_1 = aux128_0;
3616
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
3617
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
3618
+ const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
3619
+ const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
3620
+ const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
3621
+ const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
3622
+
3623
+ signs += sizeof(sign_bits);
3624
+
3625
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
3626
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
3627
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
3628
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
3629
+ const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; // low nibble: first sub-block of the pair
3630
+ const uint16_t ls2 = x[i].scales[ib32/2] >> 4; // high nibble: second sub-block of the pair
3631
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
3632
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
3633
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
3634
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
3635
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
3636
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
3637
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
3638
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
3639
+ }
3640
+
3641
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
3642
+
3643
+ }
3644
+
3645
+ *s = hsum_float_8(accumf);
3646
+
3647
+ #else
3648
+ // Portable scalar path.
3649
+ float sumf = 0.f;
3650
+ for (int i = 0; i < nb; ++i) {
3651
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined block scale
3652
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
3653
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
3654
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
3655
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3656
+ int32_t bsum = 0;
3657
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { // two 32-weight sub-blocks per iteration
3658
+ const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
3659
+ const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
3660
+ int32_t sumi = 0;
3661
+ for (int l = 0; l < 4; ++l) {
3662
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); // bit 2l of qh becomes index bit 8
3663
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); // bit 2l+1 of qh becomes index bit 8
3664
+ for (int j = 0; j < 4; ++j) {
3665
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
3666
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
3667
+ }
3668
+ q8 += 8;
3669
+ }
3670
+ qs += 8;
3671
+ signs += 4;
3672
+ bsum += sumi * ls1;
3673
+ sumi = 0;
3674
+ for (int l = 0; l < 4; ++l) {
3675
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
3676
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
3677
+ for (int j = 0; j < 4; ++j) {
3678
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
3679
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
3680
+ }
3681
+ q8 += 8;
3682
+ }
3683
+ qs += 8;
3684
+ signs += 4;
3685
+ bsum += sumi * ls2;
3686
+ }
3687
+ sumf += d * bsum;
3688
+ }
3689
+ *s = sumf;
3690
+ #endif
3691
+ }
3692
+
3693
+ #if defined(__AVX2__)
3694
+ // Multiplies corresponding signed bytes of x and y and adds each adjacent pair of
+ // products (with 16-bit saturation), yielding sixteen 16-bit sums.
+ // _mm256_maddubs_epi16 treats its first operand as unsigned, so |x| is passed there
+ // and the sign of x is transferred onto y.
+ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
3695
+ const __m256i x_magnitude = _mm256_sign_epi8(x, x);
3696
+ const __m256i y_with_x_sign = _mm256_sign_epi8(y, x);
3697
+ return _mm256_maddubs_epi16(x_magnitude, y_with_x_sign);
3698
+ }
3699
+ #endif
3700
+
3701
+ // Dot product of n IQ1_S-quantized values (vx, block_iq1_s) with n Q8_K-quantized values
+ // (vy, block_q8_K); the single float result is stored in *s.
+ //
+ //   n          - element count; must be a multiple of QK_K (asserted)
+ //   s          - output location for the scalar result
+ //   vx, vy     - the two operands, n / QK_K blocks each
+ //   bs, bx, by - unused
+ //   nrc        - must be 1 (asserted)
+ //
+ // Encoding as consumed here: every group of 8 weights is an 11-bit index into iq1s_grid
+ // (low 8 bits from qs, 3 bits from the 16-bit qh word of its 32-weight sub-block) selecting
+ // 8 signed bytes. Bits 12..14 of that qh word give the sub-block scale 2*v+1, and bit 15
+ // selects the sign of an extra IQ1S_DELTA term that is applied through y[i].bsums.
+ // One of three implementations (AVX2, AVX, portable C) is selected at compile time.
+ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3702
+ assert(n % QK_K == 0);
3703
+ assert(nrc == 1);
3704
+ UNUSED(nrc);
3705
+ UNUSED(bx);
3706
+ UNUSED(by);
3707
+ UNUSED(bs);
3708
+
3709
+ const block_iq1_s * GGML_RESTRICT x = vx;
3710
+ const block_q8_K * GGML_RESTRICT y = vy;
3711
+
3712
+ const int nb = n / QK_K; // number of blocks to process
3713
+
3714
+ #if defined __AVX2__
3715
+
3716
+ __m256 accum = _mm256_setzero_ps(); // grid-value part of the result
3717
+ float accum1 = 0; // delta part of the result, multiplied by IQ1S_DELTA at the end
3718
+ for (int i = 0; i < nb; ++i) {
3719
+
3720
+ const int8_t * q8 = y[i].qs;
3721
+ const uint8_t * qs = x[i].qs;
3722
+ const uint16_t * qh = x[i].qh;
3723
+
3724
+ __m256i sumi = _mm256_setzero_si256();
3725
+ int sumi1 = 0;
3726
+ for (int ib = 0; ib < QK_K/32; ib += 2) { // two 32-weight sub-blocks per iteration
3727
+ #ifdef __BMI2__
3728
+ uint32_t qs32[2]; memcpy(qs32, qs, sizeof(qs32)); // 8 index bytes; memcpy instead of a cast-based (type-punned, possibly unaligned) load
3729
+ const uint64_t packed_idx1 = _pdep_u64(qs32[0], 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
3730
+ const uint64_t packed_idx2 = _pdep_u64(qs32[1], 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
3731
+ // 16-bit field k of packed_idxN holds qs byte k in bits 0..7 and three qh bits in bits 8..10; fields are extracted with shifts rather than a pointer cast.
3732
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[(packed_idx1 >> 48) & 0xffff], iq1s_grid[(packed_idx1 >> 32) & 0xffff], iq1s_grid[(packed_idx1 >> 16) & 0xffff], iq1s_grid[packed_idx1 & 0xffff]);
3733
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[(packed_idx2 >> 48) & 0xffff], iq1s_grid[(packed_idx2 >> 32) & 0xffff], iq1s_grid[(packed_idx2 >> 16) & 0xffff], iq1s_grid[packed_idx2 & 0xffff]);
3734
+ #else
3735
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
3736
+ iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
3737
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
3738
+ iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
3739
+ #endif
3740
+ qs += 8;
3741
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
3742
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
3743
+
3744
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
3745
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
3746
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; // sub-block scale from qh bits 12..14
3747
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
3748
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
3749
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
3750
+
3751
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
3752
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
3753
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
3754
+ }
3755
+
3756
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); // combined block scale
3757
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
3758
+ accum1 += d * sumi1;
3759
+
3760
+ }
3761
+
3762
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
3763
+
3764
+ #elif defined __AVX__
3765
+ __m256 accum = _mm256_setzero_ps(); // grid-value part of the result
3766
+ float accum1 = 0; // delta part of the result, multiplied by IQ1S_DELTA at the end
3767
+ for (int i = 0; i < nb; ++i) {
3768
+
3769
+ const int8_t * q8 = y[i].qs;
3770
+ const uint8_t * qs = x[i].qs;
3771
+ const uint16_t * qh = x[i].qh;
3772
+
3773
+ __m128i sumi1_0 = _mm_setzero_si128();
3774
+ __m128i sumi1_1 = _mm_setzero_si128();
3775
+ int sumi1 = 0;
3776
+ for (int ib = 0; ib < QK_K/32; ib += 2) { // two 32-weight sub-blocks per iteration
3777
+ const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
3778
+ const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
3779
+ const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
3780
+ const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
3781
+ qs += 8;
3782
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3783
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3784
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3785
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3786
+
3787
+ const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
3788
+ const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
3789
+ const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
3790
+ const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
3791
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; // sub-block scale from qh bits 12..14
3792
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
3793
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
3794
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
3795
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
3796
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
3797
+
3798
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
3799
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
3800
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
3801
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
3802
+ }
3803
+
3804
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); // combined block scale
3805
+ accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
3806
+ accum1 += d * sumi1;
3807
+
3808
+ }
3809
+
3810
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
3811
+
3812
+ #else
3813
+ // Portable scalar path: one 32-weight sub-block per inner iteration.
3814
+ float sumf = 0;
3815
+ for (int i = 0; i < nb; i++) {
3816
+
3817
+ const int8_t * q8 = y[i].qs;
3818
+ const uint8_t * qs = x[i].qs;
3819
+ const uint16_t * qh = x[i].qh;
3820
+
3821
+ int sumi = 0, sumi1 = 0;
3822
+ for (int ib = 0; ib < QK_K/32; ++ib) {
3823
+ const int ls = 2*((qh[ib] >> 12) & 7) + 1; // sub-block scale from qh bits 12..14
3824
+ const int delta = qh[ib] & 0x8000 ? -1 : 1; // bit 15 selects the sign of the delta term
3825
+ int lsum = 0;
3826
+ for (int l = 0; l < 4; ++l) {
3827
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); // 11-bit index: qs byte plus 3 qh bits
3828
+ for (int j = 0; j < 8; ++j) {
3829
+ lsum += q8[j] * grid[j];
3830
+ }
3831
+ q8 += 8;
3832
+ }
3833
+ sumi += ls * lsum;
3834
+ sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
3835
+ qs += 4;
3836
+ }
3837
+
3838
+ sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
3839
+ }
3840
+
3841
+ *s = sumf;
3842
+
3843
+ #endif
3844
+ }
3845
+
3846
+ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3847
+ assert(n % QK_K == 0);
3848
+ assert(nrc == 1);
3849
+ UNUSED(nrc);
3850
+ UNUSED(bx);
3851
+ UNUSED(by);
3852
+ UNUSED(bs);
3853
+
3854
+ const block_iq1_m * GGML_RESTRICT x = vx;
3855
+ const block_q8_K * GGML_RESTRICT y = vy;
3856
+
3857
+ const int nb = n / QK_K;
3858
+
3859
+ iq1m_scale_t scale;
3860
+
3861
+ #if defined __AVX2__
3862
+
3863
+ const __m256i mask = _mm256_set1_epi16(0x7);
3864
+ const __m256i mone = _mm256_set1_epi16(1);
3865
+ const __m256i mone8 = _mm256_set1_epi8(1);
3866
+ const __m256i mtwo8 = _mm256_set1_epi8(2);
3867
+ // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half.
3868
+ const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);
3869
+
3870
+ __m256 accum1 = _mm256_setzero_ps();
3871
+ __m256 accum2 = _mm256_setzero_ps();
3872
+ for (int i = 0; i < nb; ++i) {
3873
+
3874
+ const int8_t * q8 = y[i].qs;
3875
+ const uint8_t * qs = x[i].qs;
3876
+ const uint8_t * qh = x[i].qh;
3877
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
3878
+
3879
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
3880
+ // Extract 3-bit scales (16 values)
3881
+ __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc);
3882
+ scales = _mm256_srlv_epi64(scales, scales_shift);
3883
+ scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);
3884
+
3885
+ // Indices to repeat each scale 8 times.
3886
+ __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
3887
+ __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));
3888
+
3889
+ __m256i sumi1 = _mm256_setzero_si256();
3890
+ __m256i sumi2 = _mm256_setzero_si256();
3891
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
3892
+ #ifdef __BMI2__
3893
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
3894
+ | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
3895
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
3896
+ | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
3897
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
3898
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
3899
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
3900
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
3901
+
3902
+ // Convert signs to bytes 0x81 (negative) or 0x01 (positive)
3903
+ const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
3904
+ const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
3905
+ const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
3906
+ #else
3907
+ const __m256i q1b_1 = _mm256_set_epi64x(
3908
+ iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
3909
+ iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
3910
+ );
3911
+ const __m256i q1b_2 = _mm256_set_epi64x(
3912
+ iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
3913
+ iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
3914
+ );
3915
+
3916
+ const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3917
+ qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
3918
+ qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3919
+ qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
3920
+ const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3921
+ qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
3922
+ qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3923
+ qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
3924
+ #endif
3925
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
3926
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
3927
+
3928
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
3929
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
3930
+ const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
3931
+ const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
3932
+
3933
+ __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
3934
+ __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);
3935
+
3936
+ scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
3937
+ scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);
3938
+
3939
+ const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
3940
+ const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
3941
+ const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
3942
+ const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
3943
+
3944
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
3945
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
3946
+
3947
+ qs += 8; qh += 4;
3948
+ }
3949
+
3950
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
3951
+
3952
+ accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
3953
+ accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
3954
+ }
3955
+
3956
+ *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
3957
+
3958
+ #elif defined __AVX__
3959
+ const __m128i mask = _mm_set1_epi16(0x7);
3960
+ const __m128i mone = _mm_set1_epi16(1);
3961
+
3962
+ __m256 accum1 = _mm256_setzero_ps();
3963
+ __m256 accum2 = _mm256_setzero_ps();
3964
+ for (int i = 0; i < nb; ++i) {
3965
+
3966
+ const int8_t * q8 = y[i].qs;
3967
+ const uint8_t * qs = x[i].qs;
3968
+ const uint8_t * qh = x[i].qh;
3969
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
3970
+
3971
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
3972
+
3973
+ __m128i sumi1_0 = _mm_setzero_si128();
3974
+ __m128i sumi1_1 = _mm_setzero_si128();
3975
+ __m128i sumi2_0 = _mm_setzero_si128();
3976
+ __m128i sumi2_1 = _mm_setzero_si128();
3977
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
3978
+ const __m128i q1b_1_0 = _mm_set_epi64x(
3979
+ iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
3980
+ const __m128i q1b_1_1 = _mm_set_epi64x(
3981
+ iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
3982
+ const __m128i q1b_2_0 = _mm_set_epi64x(
3983
+ iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
3984
+ const __m128i q1b_2_1 = _mm_set_epi64x(
3985
+ iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
3986
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3987
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3988
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3989
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
3990
+
3991
+ const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
3992
+ const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
3993
+ const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
3994
+ const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
3995
+
3996
+ const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3997
+ qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
3998
+ const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
3999
+ qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
4000
+ const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
4001
+ qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
4002
+ const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
4003
+ qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
4004
+
4005
+ const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
4006
+ const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
4007
+ const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
4008
+ const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
4009
+
4010
+ __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
4011
+ __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
4012
+ __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
4013
+ __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
4014
+
4015
+ scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
4016
+ scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
4017
+ scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
4018
+ scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
4019
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
4020
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
4021
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
4022
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
4023
+ const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
4024
+ const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
4025
+ const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
4026
+ const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
4027
+
4028
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
4029
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
4030
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
4031
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
4032
+
4033
+ qs += 8; qh += 4;
4034
+ }
4035
+
4036
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
4037
+
4038
+ accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
4039
+ accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
4040
+ }
4041
+
4042
+ *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
4043
+
4044
+ #else
4045
+
4046
+ int sum1[2], sum2[2], delta[4];
4047
+
4048
+ float sumf = 0;
4049
+ for (int i = 0; i < nb; i++) {
4050
+
4051
+ const int8_t * q8 = y[i].qs;
4052
+ const uint8_t * qs = x[i].qs;
4053
+ const uint8_t * qh = x[i].qh;
4054
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
4055
+
4056
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
4057
+
4058
+ int sumi1 = 0, sumi2 = 0;
4059
+ for (int ib = 0; ib < QK_K/32; ++ib) {
4060
+ delta[0] = qh[0] & 0x08 ? -1 : 1;
4061
+ delta[1] = qh[0] & 0x80 ? -1 : 1;
4062
+ delta[2] = qh[1] & 0x08 ? -1 : 1;
4063
+ delta[3] = qh[1] & 0x80 ? -1 : 1;
4064
+ sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
4065
+ for (int l = 0; l < 4; ++l) {
4066
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
4067
+ int lsum1 = 0, lsum2 = 0;
4068
+ for (int j = 0; j < 8; ++j) {
4069
+ lsum1 += q8[j] * grid[j];
4070
+ lsum2 += q8[j];
4071
+ }
4072
+ q8 += 8;
4073
+ sum1[l/2] += lsum1;
4074
+ sum2[l/2] += lsum2*delta[l];
4075
+ }
4076
+
4077
+ const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
4078
+ const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
4079
+
4080
+ sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
4081
+ sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
4082
+ qs += 4;
4083
+ qh += 2;
4084
+ }
4085
+
4086
+ sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
4087
+ }
4088
+
4089
+ *s = sumf;
4090
+
4091
+ #endif
4092
+ }
4093
+
4094
// ggml_vec_dot_iq4_nl_q8_0: dot product of n values between a row read as
// block_iq4_nl (vx) and a row read as block_q8_0 (vy); one float is stored to *s.
//   n   - number of values; must be a multiple of QK4_NL (asserted).
//   s   - output location for the single result.
//   vx  - reinterpreted as an array of block_iq4_nl.
//   vy  - reinterpreted as an array of block_q8_0.
//   nrc - must be 1 (asserted). bs, bx and by are not used here.
// Structure: an optional AVX2 or AVX loop consumes blocks two at a time, then a
// scalar loop finishes whatever blocks remain (all of them when neither SIMD
// path is compiled in).
+ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4095
+ assert(nrc == 1);
4096
+ UNUSED(nrc);
4097
+ UNUSED(bx);
4098
+ UNUSED(by);
4099
+ UNUSED(bs);
4100
+ assert(n % QK4_NL == 0);
4101
// The loops below index x[] and y[] with the same block index, so both block
// formats must cover the same number of values.
+ static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
4102
+
4103
+ const block_iq4_nl * GGML_RESTRICT x = vx;
4104
+ const block_q8_0 * GGML_RESTRICT y = vy;
4105
+
4106
+ const int nb = n / QK4_NL;
4107
+
4108
// ib is shared between the SIMD loop and the scalar tail loop, so the tail
// resumes exactly where the SIMD loop stopped.
+ int ib = 0;
4109
+ float sumf = 0;
4110
+
4111
+ #if defined __AVX2__
4112
+
4113
// values128 holds 16 bytes loaded from kvalues_iq4nl; it is used below as the
// table operand of _mm_shuffle_epi8, i.e. a 16-entry byte lookup indexed by nibble.
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
4114
+ const __m128i m4b = _mm_set1_epi8(0x0f);
4115
+ const __m256i mone = _mm256_set1_epi16(1);
4116
+
4117
// Two independent accumulators, one per block of each pair; summed after the loop.
+ __m256 accum1 = _mm256_setzero_ps();
4118
+ __m256 accum2 = _mm256_setzero_ps();
4119
// Process blocks in pairs; a trailing odd block is left for the scalar loop.
+ for (; ib + 1 < nb; ib += 2) {
4120
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
4121
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
4122
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
4123
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
4124
// Split each packed byte into low and high nibbles and map both through the
// lookup table. NOTE(review): MM256_SET_M128I is defined elsewhere; presumably
// its first argument becomes the upper 128 bits, which would pair the high
// nibbles with the second half of the q8 values as the scalar loop does - confirm.
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
4125
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
4126
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
4127
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
4128
// NOTE(review): mul_add_epi8 is a helper defined elsewhere; presumably it yields
// 16-bit partial sums of the signed byte products - confirm.
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
4129
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
4130
// madd against a vector of ones adds adjacent 16-bit lanes into 32-bit lanes.
+ const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
4131
+ const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
4132
// Scale the integer sums by the product of the two blocks' d fields (converted
// with GGML_FP16_TO_FP32) and accumulate with a fused multiply-add.
+ accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
4133
+ _mm256_cvtepi32_ps(p_1), accum1);
4134
+ accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
4135
+ _mm256_cvtepi32_ps(p_2), accum2);
4136
+ }
4137
+
4138
// NOTE(review): hsum_float_8 is defined elsewhere; presumably a horizontal sum
// of the eight float lanes - confirm.
+ sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
4139
+
4140
+ #elif defined __AVX__
4141
// AVX variant: same pairing of blocks, but integer work is done on 128-bit halves.
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
4142
+ const __m128i m4b = _mm_set1_epi8(0x0f);
4143
+
4144
+ __m256 accum = _mm256_setzero_ps();
4145
+ for (; ib + 1 < nb; ib += 2) {
4146
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
4147
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
4148
// Each block's q8 values are loaded as two consecutive 16-byte halves.
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
4149
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
4150
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
4151
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
4152
+
4153
// *_0 holds the table values for the low nibbles, *_1 for the high nibbles.
+ const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
4154
+ const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
4155
+ const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
4156
+ const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
4157
+
4158
// NOTE(review): mul_sum_i8_quad_float and quad_fp16_delta_float are helpers
// defined elsewhere; presumably they return the integer dot products as floats
// and the matching per-block scale products - confirm against their definitions.
+ const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
4159
+ const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
4160
+ accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
4161
+ }
4162
+
4163
+ sumf = hsum_float_8(accum);
4164
+
4165
+ #endif
4166
// Scalar loop: handles every block the SIMD loop above did not consume.
+ for (; ib < nb; ++ib) {
4167
+ const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
4168
+ int sumi1 = 0, sumi2 = 0;
4169
+ for (int j = 0; j < QK4_NL/2; ++j) {
4170
// Low nibble of byte j pairs with q8 value j; high nibble pairs with q8 value
// j + QK4_NL/2. Both nibbles are translated through kvalues_iq4nl first.
+ sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
4171
+ sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
4172
+ }
4173
+ sumf += d * (sumi1 + sumi2);
4174
+ }
4175
+ *s = sumf;
4176
+ }
4177
+
4178
// ggml_vec_dot_iq4_xs_q8_K: dot product of n values between a row read as
// block_iq4_xs (vx) and a row read as block_q8_K (vy); one float is stored to *s.
//   n   - number of values; must be a multiple of QK_K (asserted).
//   s   - output location for the single result.
//   vx  - reinterpreted as an array of block_iq4_xs.
//   vy  - reinterpreted as an array of block_q8_K.
//   nrc - must be 1 (asserted). bs, bx and by are not used here.
// Exactly one of three implementations is compiled: AVX2, AVX, or plain C.
// In all three, each outer iteration handles one block and the inner loop walks
// its QK_K/32 sub-blocks two at a time, each sub-block with its own 6-bit scale.
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4179
+ assert(nrc == 1);
4180
+ UNUSED(nrc);
4181
+ UNUSED(bx);
4182
+ UNUSED(by);
4183
+ UNUSED(bs);
4184
+ assert(n % QK_K == 0);
4185
+
4186
+ const block_iq4_xs * GGML_RESTRICT x = vx;
4187
+ const block_q8_K * GGML_RESTRICT y = vy;
4188
+
4189
+ const int nb = n / QK_K;
4190
+
4191
+ #if defined __AVX2__
4192
+
4193
// values128 holds 16 bytes loaded from kvalues_iq4nl; it is used below as the
// table operand of _mm_shuffle_epi8, i.e. a 16-entry byte lookup indexed by nibble.
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
4194
+ const __m128i m4b = _mm_set1_epi8(0x0f);
4195
+
4196
+ __m256 accum = _mm256_setzero_ps();
4197
+ for (int ibl = 0; ibl < nb; ++ibl) {
4198
+ const uint8_t * qs = x[ibl].qs;
4199
+ const int8_t * q8 = y[ibl].qs;
4200
// sh is a working copy of scales_h; it is shifted right by 4 after each pair of
// sub-blocks, so its low bits always belong to the current pair.
+ uint16_t sh = x[ibl].scales_h;
4201
+ __m256i sumi1 = _mm256_setzero_si256();
4202
+ __m256i sumi2 = _mm256_setzero_si256();
4203
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
4204
// Each sub-block consumes 16 packed bytes from qs and 32 bytes from q8.
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
4205
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
4206
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
4207
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
4208
// Map low and high nibbles through the lookup table. NOTE(review):
// MM256_SET_M128I is defined elsewhere; presumably its first argument becomes
// the upper 128 bits, matching the scalar path's q8[j+16] pairing - confirm.
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
4209
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
4210
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
4211
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
4212
// NOTE(review): mul_add_epi8 is a helper defined elsewhere; presumably it yields
// 16-bit partial sums of the signed byte products - confirm.
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
4213
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
4214
// Rebuild the two 6-bit sub-block scales: bits 0-3 come from the low (ls1) or
// high (ls2) nibble of scales_l[ib/2], bits 4-5 from sh bits 0-1 (ls1) or
// bits 2-3 (ls2). 32 is then subtracted, so the scale may be negative.
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
4215
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
4216
+ sh >>= 4;
4217
// madd multiplies each 16-bit partial sum by the broadcast scale and adds
// adjacent pairs into 32-bit lanes.
+ const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
4218
+ const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
4219
+ sumi1 = _mm256_add_epi32(p_1, sumi1);
4220
+ sumi2 = _mm256_add_epi32(p_2, sumi2);
4221
+ }
4222
// Apply the block-level scale: x's d goes through GGML_FP16_TO_FP32, y's d is
// multiplied in directly.
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
4223
+ _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
4224
+ }
4225
+
4226
// NOTE(review): hsum_float_8 is defined elsewhere; presumably a horizontal sum
// of the eight float lanes - confirm.
+ *s = hsum_float_8(accum);
4227
+
4228
+ #elif defined __AVX__
4229
// AVX variant: same algorithm, with all integer work done on 128-bit halves
// (suffix _0 / _1) and recombined only for the final float conversion.
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
4230
+ const __m128i m4b = _mm_set1_epi8(0x0f);
4231
+
4232
+ __m256 accum = _mm256_setzero_ps();
4233
+ for (int ibl = 0; ibl < nb; ++ibl) {
4234
+ const uint8_t * qs = x[ibl].qs;
4235
+ const int8_t * q8 = y[ibl].qs;
4236
+ uint16_t sh = x[ibl].scales_h;
4237
+ __m128i sumi1_0 = _mm_setzero_si128();
4238
+ __m128i sumi1_1 = _mm_setzero_si128();
4239
+ __m128i sumi2_0 = _mm_setzero_si128();
4240
+ __m128i sumi2_1 = _mm_setzero_si128();
4241
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
4242
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
4243
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
4244
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
4245
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
4246
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
4247
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
4248
// *_0 holds the table values for the low nibbles, *_1 for the high nibbles.
+ const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
4249
+ const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
4250
+ const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
4251
+ const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
4252
// NOTE(review): mul_add_epi8_sse is a helper defined elsewhere; presumably the
// 128-bit counterpart of mul_add_epi8 - confirm.
+ const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
4253
+ const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
4254
+ const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
4255
+ const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
4256
// Same 6-bit scale reconstruction as the AVX2 path: 4 bits from scales_l,
// 2 bits from sh, minus 32.
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
4257
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
4258
+ sh >>= 4;
4259
+ const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
4260
+ const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
4261
+ const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
4262
+ const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
4263
+ sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
4264
+ sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
4265
+ sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
4266
+ sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
4267
+ }
4268
// Merge the two sub-block accumulators per half, then join the halves into one
// 256-bit vector for conversion to float and block-level scaling.
+ __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
4269
+ __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
4270
+ accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
4271
+ _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
4272
+ }
4273
+
4274
+ *s = hsum_float_8(accum);
4275
+
4276
+ #else
4277
// Plain C fallback. Unlike the SIMD paths, the sub-block scale is applied in
// floating point (d1/d2) rather than folded into the integer sums.
+ float sumf = 0;
4278
+ for (int ibl = 0; ibl < nb; ++ibl) {
4279
+ const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
4280
+ uint16_t h = x[ibl].scales_h;
4281
+ const uint8_t * qs = x[ibl].qs;
4282
+ const int8_t * q8 = y[ibl].qs;
4283
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
4284
// 6-bit scales: low 4 bits from a nibble of scales_l[ib/2], bits 4-5 from h
// (bits 0-1 for ls1, bits 2-3 for ls2). The -32 offset is applied in d1/d2.
+ const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
4285
+ const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
4286
+ h >>= 4;
4287
+ const float d1 = d4d8*(ls1 - 32);
4288
+ const float d2 = d4d8*(ls2 - 32);
4289
+ int sumi1 = 0, sumi2 = 0;
4290
// First sub-block of the pair: low nibbles pair with q8[0..15], high nibbles
// with q8[16..31].
+ for (int j = 0; j < 16; ++j) {
4291
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
4292
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
4293
+ }
4294
+ sumf += d1 * (sumi1 + sumi2);
4295
+ qs += 16;
4296
+ q8 += 32;
4297
// Second sub-block of the pair: same computation, scaled by d2.
+ sumi1 = sumi2 = 0;
4298
+ for (int j = 0; j < 16; ++j) {
4299
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
4300
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
4301
+ }
4302
+ sumf += d2 * (sumi1 + sumi2);
4303
+ qs += 16;
4304
+ q8 += 32;
4305
+ }
4306
+ }
4307
+ *s = sumf;
4308
+ #endif
4309
+ }
4310
+