llama_cpp 0.13.0 → 0.14.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,12 @@
1
+ #define GGML_COMMON_IMPL_C
2
+ #include "ggml-common.h"
3
+
1
4
  #include "ggml-quants.h"
2
5
  #include "ggml-impl.h"
3
6
 
7
+ #define GGML_COMMON_IMPL_C
8
+ #include "ggml-common.h"
9
+
4
10
  #include <math.h>
5
11
  #include <string.h>
6
12
  #include <assert.h>
@@ -51,6 +57,7 @@
51
57
 
52
58
  #define UNUSED GGML_UNUSED
53
59
 
60
+ // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
54
61
  #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
55
62
 
56
63
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
@@ -463,8 +470,8 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
463
470
  }
464
471
 
465
472
  // NOTE: not tested
466
- inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
467
- int8x16_t res;
473
+ inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
474
+ uint8x16_t res;
468
475
 
469
476
  res[ 0] = a[b[ 0]];
470
477
  res[ 1] = a[b[ 1]];
@@ -947,7 +954,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
947
954
  const float d = amax / ((1 << 7) - 1);
948
955
  const float id = d ? 1.0f/d : 0.0f;
949
956
 
950
- y[i].d = d;
957
+ y[i].d = GGML_FP32_TO_FP16(d);
951
958
 
952
959
  int sum = 0;
953
960
 
@@ -962,7 +969,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
962
969
  sum += y[i].qs[QK8_1/2 + j];
963
970
  }
964
971
 
965
- y[i].s = sum*d;
972
+ y[i].s = GGML_FP32_TO_FP16(sum*d);
966
973
  }
967
974
  }
968
975
 
@@ -990,7 +997,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
990
997
  const float d = amax / ((1 << 7) - 1);
991
998
  const float id = d ? 1.0f/d : 0.0f;
992
999
 
993
- y[i].d = d;
1000
+ y[i].d = GGML_FP32_TO_FP16(d);
994
1001
 
995
1002
  int32x4_t accv = vdupq_n_s32(0);
996
1003
 
@@ -1006,7 +1013,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1006
1013
  accv = vaddq_s32(accv, vi);
1007
1014
  }
1008
1015
 
1009
- y[i].s = d * vaddvq_s32(accv);
1016
+ y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
1010
1017
  }
1011
1018
  #elif defined(__wasm_simd128__)
1012
1019
  for (int i = 0; i < nb; i++) {
@@ -1029,7 +1036,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1029
1036
  const float d = amax / ((1 << 7) - 1);
1030
1037
  const float id = d ? 1.0f/d : 0.0f;
1031
1038
 
1032
- y[i].d = d;
1039
+ y[i].d = GGML_FP32_TO_FP16(d);
1033
1040
 
1034
1041
  v128_t accv = wasm_i32x4_splat(0);
1035
1042
 
@@ -1045,10 +1052,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1045
1052
  accv = wasm_i32x4_add(accv, vi);
1046
1053
  }
1047
1054
 
1048
- y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
1049
- wasm_i32x4_extract_lane(accv, 1) +
1050
- wasm_i32x4_extract_lane(accv, 2) +
1051
- wasm_i32x4_extract_lane(accv, 3));
1055
+ y[i].s = GGML_FP32_TO_FP16(
1056
+ d * (wasm_i32x4_extract_lane(accv, 0) +
1057
+ wasm_i32x4_extract_lane(accv, 1) +
1058
+ wasm_i32x4_extract_lane(accv, 2) +
1059
+ wasm_i32x4_extract_lane(accv, 3)));
1052
1060
  }
1053
1061
  #elif defined(__AVX2__) || defined(__AVX__)
1054
1062
  for (int i = 0; i < nb; i++) {
@@ -1073,7 +1081,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1073
1081
 
1074
1082
  // Quantize these floats
1075
1083
  const float d = maxScalar / 127.f;
1076
- y[i].d = d;
1084
+ y[i].d = GGML_FP32_TO_FP16(d);
1077
1085
  const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
1078
1086
  const __m256 mul = _mm256_set1_ps( id );
1079
1087
 
@@ -1097,7 +1105,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1097
1105
 
1098
1106
  #if defined(__AVX2__)
1099
1107
  // Compute the sum of the quants and set y[i].s
1100
- y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
1108
+ y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
1101
1109
 
1102
1110
  // Convert int32 to int16
1103
1111
  i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
@@ -1127,7 +1135,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1127
1135
  // Compute the sum of the quants and set y[i].s
1128
1136
  const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
1129
1137
  const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
1130
- y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
1138
+ y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
1131
1139
 
1132
1140
  // Convert int32 to int16
1133
1141
  ni0 = _mm_packs_epi32( ni0, ni1 );
@@ -1158,7 +1166,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1158
1166
  const float d = amax / ((1 << 7) - 1);
1159
1167
  const float id = d ? 1.0f/d : 0.0f;
1160
1168
 
1161
- y[i].d = d;
1169
+ y[i].d = GGML_FP32_TO_FP16(d);
1162
1170
 
1163
1171
  vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
1164
1172
 
@@ -1175,7 +1183,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1175
1183
 
1176
1184
  // set y[i].s
1177
1185
  int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
1178
- y[i].s = sum*d;
1186
+ y[i].s = GGML_FP32_TO_FP16(sum*d);
1179
1187
  }
1180
1188
  #else
1181
1189
  GGML_UNUSED(nb);
@@ -1700,16 +1708,6 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
1700
1708
  quantize_row_q2_K_reference(x, vy, k);
1701
1709
  }
1702
1710
 
1703
- size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
1704
- (void)hist; // TODO: collect histograms
1705
-
1706
- for (int j = 0; j < n; j += k) {
1707
- block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
1708
- quantize_row_q2_K_reference(src + j, y, k);
1709
- }
1710
- return (n/QK_K*sizeof(block_q2_K));
1711
- }
1712
-
1713
1711
  static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
1714
1712
  uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
1715
1713
  float rmin, float rdelta, int nstep, bool use_mad) {
@@ -1962,8 +1960,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1962
1960
  }
1963
1961
  }
1964
1962
 
1965
- size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
1966
- (void)hist;
1963
+ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
1967
1964
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1968
1965
  if (!quant_weights) {
1969
1966
  quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
@@ -2182,16 +2179,6 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
2182
2179
  quantize_row_q3_K_reference(x, vy, k);
2183
2180
  }
2184
2181
 
2185
- size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
2186
- (void)hist; // TODO: collect histograms
2187
-
2188
- for (int j = 0; j < n; j += k) {
2189
- block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
2190
- quantize_row_q3_K_reference(src + j, y, k);
2191
- }
2192
- return (n/QK_K*sizeof(block_q3_K));
2193
- }
2194
-
2195
2182
  static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2196
2183
  #if QK_K != 256
2197
2184
  (void)quant_weights;
@@ -2281,8 +2268,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
2281
2268
  #endif
2282
2269
  }
2283
2270
 
2284
- size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2285
- (void)hist;
2271
+ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2286
2272
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2287
2273
  if (!quant_weights) {
2288
2274
  quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
@@ -2452,17 +2438,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
2452
2438
  quantize_row_q4_K_reference(x, y, k);
2453
2439
  }
2454
2440
 
2455
- size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
2456
- assert(k % QK_K == 0);
2457
- (void)hist; // TODO: collect histograms
2458
-
2459
- for (int j = 0; j < n; j += k) {
2460
- block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
2461
- quantize_row_q4_K_reference(src + j, y, k);
2462
- }
2463
- return (n/QK_K*sizeof(block_q4_K));
2464
- }
2465
-
2466
2441
  static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2467
2442
  #if QK_K != 256
2468
2443
  (void)quant_weights;
@@ -2541,8 +2516,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2541
2516
  #endif
2542
2517
  }
2543
2518
 
2544
- size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2545
- (void)hist;
2519
+ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2546
2520
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2547
2521
  if (!quant_weights) {
2548
2522
  quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
@@ -2753,17 +2727,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
2753
2727
  quantize_row_q5_K_reference(x, y, k);
2754
2728
  }
2755
2729
 
2756
- size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
2757
- assert(k % QK_K == 0);
2758
- (void)hist; // TODO: collect histograms
2759
-
2760
- for (int j = 0; j < n; j += k) {
2761
- block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
2762
- quantize_row_q5_K_reference(src + j, y, k);
2763
- }
2764
- return (n/QK_K*sizeof(block_q5_K));
2765
- }
2766
-
2767
2730
  static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2768
2731
  #if QK_K != 256
2769
2732
  (void)quant_weights;
@@ -2862,8 +2825,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2862
2825
  #endif
2863
2826
  }
2864
2827
 
2865
- size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2866
- (void)hist;
2828
+ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2867
2829
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2868
2830
  if (!quant_weights) {
2869
2831
  quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
@@ -3016,17 +2978,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
3016
2978
  quantize_row_q6_K_reference(x, y, k);
3017
2979
  }
3018
2980
 
3019
- size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
3020
- assert(k % QK_K == 0);
3021
- (void)hist; // TODO: collect histograms
3022
-
3023
- for (int j = 0; j < n; j += k) {
3024
- block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
3025
- quantize_row_q6_K_reference(src + j, y, k);
3026
- }
3027
- return (n/QK_K*sizeof(block_q6_K));
3028
- }
3029
-
3030
2981
  static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
3031
2982
  #if QK_K != 256
3032
2983
  (void)quant_weights;
@@ -3116,8 +3067,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
3116
3067
  #endif
3117
3068
  }
3118
3069
 
3119
- size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3120
- (void)hist;
3070
+ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3121
3071
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3122
3072
  if (!quant_weights) {
3123
3073
  quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
@@ -3161,9 +3111,10 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3161
3111
  }
3162
3112
  }
3163
3113
 
3164
- size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3114
+ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3165
3115
  if (!quant_weights) {
3166
- return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
3116
+ quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
3117
+ return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3167
3118
  }
3168
3119
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3169
3120
  char * qrow = (char *)dst;
@@ -3205,9 +3156,10 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3205
3156
  }
3206
3157
  }
3207
3158
 
3208
- size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3159
+ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3209
3160
  if (!quant_weights) {
3210
- return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
3161
+ quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
3162
+ return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3211
3163
  }
3212
3164
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3213
3165
  char * qrow = (char *)dst;
@@ -3258,9 +3210,10 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3258
3210
  }
3259
3211
  }
3260
3212
 
3261
- size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3213
+ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3262
3214
  if (!quant_weights) {
3263
- return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
3215
+ quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
3216
+ return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3264
3217
  }
3265
3218
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3266
3219
  char * qrow = (char *)dst;
@@ -3310,9 +3263,10 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3310
3263
  }
3311
3264
  }
3312
3265
 
3313
- size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3266
+ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3314
3267
  if (!quant_weights) {
3315
- return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
3268
+ quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
3269
+ return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3316
3270
  }
3317
3271
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3318
3272
  char * qrow = (char *)dst;
@@ -3324,712 +3278,14 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
3324
3278
  return nrow * row_size;
3325
3279
  }
3326
3280
 
3327
- // ====================== "True" 2-bit (de)-quantization
3328
-
3329
- static const uint64_t iq2xxs_grid[256] = {
3330
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3331
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
3332
- 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
3333
- 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
3334
- 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
3335
- 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
3336
- 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
3337
- 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
3338
- 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
3339
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
3340
- 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
3341
- 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
3342
- 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
3343
- 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
3344
- 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
3345
- 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
3346
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
3347
- 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
3348
- 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
3349
- 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
3350
- 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
3351
- 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
3352
- 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
3353
- 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
3354
- 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
3355
- 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
3356
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
3357
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
3358
- 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
3359
- 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
3360
- 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
3361
- 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
3362
- 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
3363
- 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
3364
- 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
3365
- 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
3366
- 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
3367
- 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
3368
- 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
3369
- 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
3370
- 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
3371
- 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
3372
- 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
3373
- 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
3374
- 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
3375
- 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
3376
- 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
3377
- 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
3378
- 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
3379
- 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
3380
- 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
3381
- 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
3382
- 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
3383
- 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
3384
- 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
3385
- 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
3386
- 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
3387
- 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
3388
- 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
3389
- 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
3390
- 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
3391
- 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
3392
- 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
3393
- 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
3394
- };
3395
-
3396
- static const uint64_t iq2xs_grid[512] = {
3397
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3398
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
3399
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
3400
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
3401
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
3402
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
3403
- 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
3404
- 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
3405
- 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
3406
- 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
3407
- 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
3408
- 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
3409
- 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
3410
- 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
3411
- 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
3412
- 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
3413
- 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
3414
- 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
3415
- 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
3416
- 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
3417
- 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
3418
- 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
3419
- 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
3420
- 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
3421
- 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
3422
- 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
3423
- 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
3424
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
3425
- 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
3426
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
3427
- 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
3428
- 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
3429
- 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
3430
- 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
3431
- 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
3432
- 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
3433
- 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
3434
- 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
3435
- 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
3436
- 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
3437
- 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
3438
- 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
3439
- 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
3440
- 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
3441
- 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
3442
- 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
3443
- 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
3444
- 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
3445
- 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
3446
- 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
3447
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
3448
- 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
3449
- 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
3450
- 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
3451
- 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
3452
- 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
3453
- 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
3454
- 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
3455
- 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
3456
- 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
3457
- 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
3458
- 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
3459
- 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
3460
- 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
3461
- 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
3462
- 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
3463
- 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
3464
- 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
3465
- 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
3466
- 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
3467
- 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
3468
- 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
3469
- 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
3470
- 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
3471
- 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
3472
- 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
3473
- 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
3474
- 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
3475
- 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
3476
- 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
3477
- 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
3478
- 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
3479
- 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
3480
- 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
3481
- 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
3482
- 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
3483
- 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
3484
- 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
3485
- 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
3486
- 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
3487
- 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
3488
- 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
3489
- 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
3490
- 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
3491
- 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
3492
- 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
3493
- 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
3494
- 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
3495
- 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
3496
- 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
3497
- 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
3498
- 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
3499
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
3500
- 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
3501
- 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
3502
- 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
3503
- 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
3504
- 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
3505
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
3506
- 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
3507
- 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
3508
- 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
3509
- 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
3510
- 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
3511
- 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
3512
- 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
3513
- 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
3514
- 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
3515
- 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
3516
- 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
3517
- 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
3518
- 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
3519
- 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
3520
- 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
3521
- 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
3522
- 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
3523
- 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
3524
- 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
3525
- };
3526
-
3527
- static const uint64_t iq2s_grid[1024] = {
3528
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3529
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
3530
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
3531
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
3532
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
3533
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
3534
- 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
3535
- 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
3536
- 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
3537
- 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
3538
- 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
3539
- 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
3540
- 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
3541
- 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
3542
- 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
3543
- 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
3544
- 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
3545
- 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
3546
- 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
3547
- 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
3548
- 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
3549
- 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
3550
- 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
3551
- 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
3552
- 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
3553
- 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
3554
- 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
3555
- 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
3556
- 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
3557
- 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
3558
- 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
3559
- 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
3560
- 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
3561
- 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
3562
- 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
3563
- 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
3564
- 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
3565
- 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
3566
- 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
3567
- 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
3568
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
3569
- 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
3570
- 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
3571
- 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
3572
- 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
3573
- 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
3574
- 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
3575
- 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
3576
- 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
3577
- 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
3578
- 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
3579
- 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
3580
- 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
3581
- 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
3582
- 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
3583
- 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
3584
- 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
3585
- 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
3586
- 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
3587
- 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
3588
- 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
3589
- 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
3590
- 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
3591
- 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
3592
- 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
3593
- 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
3594
- 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
3595
- 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
3596
- 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
3597
- 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
3598
- 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
3599
- 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
3600
- 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
3601
- 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
3602
- 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
3603
- 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
3604
- 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
3605
- 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
3606
- 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
3607
- 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
3608
- 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
3609
- 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
3610
- 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
3611
- 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
3612
- 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
3613
- 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
3614
- 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
3615
- 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
3616
- 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
3617
- 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
3618
- 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
3619
- 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
3620
- 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
3621
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
3622
- 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
3623
- 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
3624
- 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
3625
- 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
3626
- 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
3627
- 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
3628
- 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
3629
- 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
3630
- 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
3631
- 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
3632
- 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
3633
- 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
3634
- 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
3635
- 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
3636
- 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
3637
- 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
3638
- 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
3639
- 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
3640
- 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
3641
- 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
3642
- 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
3643
- 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
3644
- 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
3645
- 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
3646
- 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
3647
- 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
3648
- 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
3649
- 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
3650
- 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
3651
- 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
3652
- 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
3653
- 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
3654
- 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
3655
- 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
3656
- 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
3657
- 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
3658
- 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
3659
- 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
3660
- 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
3661
- 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
3662
- 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
3663
- 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
3664
- 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
3665
- 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
3666
- 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
3667
- 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
3668
- 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
3669
- 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
3670
- 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
3671
- 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
3672
- 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
3673
- 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
3674
- 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
3675
- 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
3676
- 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
3677
- 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
3678
- 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
3679
- 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
3680
- 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
3681
- 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
3682
- 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
3683
- 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
3684
- 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
3685
- 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
3686
- 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
3687
- 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
3688
- 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
3689
- 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
3690
- 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
3691
- 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
3692
- 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
3693
- 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
3694
- 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
3695
- 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
3696
- 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
3697
- 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
3698
- 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
3699
- 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
3700
- 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
3701
- 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
3702
- 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
3703
- 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
3704
- 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
3705
- 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
3706
- 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
3707
- 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
3708
- 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
3709
- 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
3710
- 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
3711
- 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
3712
- 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
3713
- 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
3714
- 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
3715
- 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
3716
- 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
3717
- 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
3718
- 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
3719
- 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
3720
- 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
3721
- 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
3722
- 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
3723
- 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
3724
- 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
3725
- 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
3726
- 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
3727
- 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
3728
- 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
3729
- 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
3730
- 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
3731
- 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
3732
- 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
3733
- 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
3734
- 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
3735
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
3736
- 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
3737
- 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
3738
- 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
3739
- 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
3740
- 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
3741
- 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
3742
- 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
3743
- 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
3744
- 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
3745
- 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
3746
- 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
3747
- 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
3748
- 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
3749
- 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
3750
- 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
3751
- 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
3752
- 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
3753
- 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
3754
- 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
3755
- 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
3756
- 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
3757
- 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
3758
- 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
3759
- 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
3760
- 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
3761
- 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
3762
- 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
3763
- 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
3764
- 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
3765
- 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
3766
- 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
3767
- 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
3768
- 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
3769
- 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
3770
- 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
3771
- 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
3772
- 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
3773
- 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
3774
- 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
3775
- 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
3776
- 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
3777
- 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
3778
- 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
3779
- 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
3780
- 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
3781
- 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
3782
- 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
3783
- 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
3784
- };
3785
-
3786
- static const uint32_t iq3xxs_grid[256] = {
3787
- 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
3788
- 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
3789
- 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
3790
- 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
3791
- 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
3792
- 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
3793
- 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
3794
- 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
3795
- 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
3796
- 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
3797
- 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
3798
- 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
3799
- 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
3800
- 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
3801
- 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
3802
- 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
3803
- 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
3804
- 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
3805
- 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
3806
- 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
3807
- 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
3808
- 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
3809
- 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
3810
- 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
3811
- 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
3812
- 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
3813
- 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
3814
- 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
3815
- 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
3816
- 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
3817
- 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
3818
- 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
3819
- };
3820
-
3821
- static const uint32_t iq3xs_grid[512] = {
3822
- 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
3823
- 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
3824
- 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
3825
- 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
3826
- 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
3827
- 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
3828
- 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
3829
- 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
3830
- 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
3831
- 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
3832
- 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
3833
- 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
3834
- 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
3835
- 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
3836
- 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
3837
- 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
3838
- 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
3839
- 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
3840
- 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
3841
- 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
3842
- 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
3843
- 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
3844
- 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
3845
- 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
3846
- 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
3847
- 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
3848
- 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
3849
- 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
3850
- 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
3851
- 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
3852
- 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
3853
- 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
3854
- 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
3855
- 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
3856
- 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
3857
- 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
3858
- 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
3859
- 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
3860
- 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
3861
- 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
3862
- 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
3863
- 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
3864
- 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
3865
- 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
3866
- 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
3867
- 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
3868
- 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
3869
- 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
3870
- 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
3871
- 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
3872
- 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
3873
- 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
3874
- 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
3875
- 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
3876
- 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
3877
- 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
3878
- 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
3879
- 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
3880
- 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
3881
- 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
3882
- 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
3883
- 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
3884
- 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
3885
- 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
3886
- };
3887
-
3888
- #define NGRID_IQ2XXS 512
3889
- static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
3890
- 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
3891
- 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
3892
- 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
3893
- 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
3894
- 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
3895
- 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
3896
- 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
3897
- 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
3898
- 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
3899
- 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
3900
- 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
3901
- 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
3902
- 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
3903
- 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
3904
- 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
3905
- 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
3906
- 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
3907
- 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
3908
- 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
3909
- 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
3910
- 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
3911
- 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
3912
- 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
3913
- 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
3914
- 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
3915
- 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
3916
- 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
3917
- 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
3918
- 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
3919
- 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
3920
- 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
3921
- 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
3922
- 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
3923
- 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
3924
- 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
3925
- 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
3926
- 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
3927
- 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
3928
- 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
3929
- 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
3930
- 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
3931
- 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
3932
- 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
3933
- 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
3934
- 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
3935
- 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
3936
- 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
3937
- 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
3938
- 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
3939
- 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
3940
- 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
3941
- 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
3942
- 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
3943
- 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
3944
- 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
3945
- 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
3946
- 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
3947
- 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
3948
- 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
3949
- 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
3950
- 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
3951
- 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
3952
- 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
3953
- 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
3954
- 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
3955
- 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
3956
- 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
3957
- 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
3958
- 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
3959
- 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
3960
- 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
3961
- 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
3962
- 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
3963
- 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
3964
- 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
3965
- 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
3966
- 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
3967
- 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
3968
- 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
3969
- 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
3970
- 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
3971
- 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
3972
- 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
3973
- 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
3974
- 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
3975
- 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
3976
- 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
3977
- 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
3978
- 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
3979
- 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
3980
- 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
3981
- 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
3982
- 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
3983
- 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
3984
- 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
3985
- 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
3986
- 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
3987
- 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
3988
- 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
3989
- 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
3990
- 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
3991
- 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
3992
- 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
3993
- 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
3994
- 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
3995
- 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
3996
- 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
3997
- 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
3998
- 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
3999
- 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
4000
- 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
4001
- 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
4002
- 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
4003
- 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
4004
- 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
4005
- 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
4006
- 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
4007
- 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
4008
- 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
4009
- 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
4010
- 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
4011
- 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
4012
- 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
4013
- 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
4014
- 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
4015
- 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
4016
- 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
4017
- 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
4018
-
4019
- };
4020
-
4021
- static const uint8_t ksigns_iq2xs[128] = {
4022
- 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
4023
- 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
4024
- 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
4025
- 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
4026
- 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
4027
- 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
4028
- 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
4029
- 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
4030
- };
3281
+ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3282
+ (void)quant_weights; // not used
3283
+ const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
3284
+ quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
3285
+ return nrow * row_size;
3286
+ }
4031
3287
 
4032
- static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
3288
+ // ====================== "True" 2-bit (de)-quantization
4033
3289
 
4034
3290
  void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
4035
3291
  assert(k % QK_K == 0);
@@ -4162,11 +3418,11 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
4162
3418
  const uint8_t * signs = x[i].signs;
4163
3419
 
4164
3420
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
4165
- const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f;
4166
- const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f;
3421
+ const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
3422
+ const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4));
4167
3423
  for (int l = 0; l < 4; ++l) {
4168
- const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
4169
- const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
3424
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
3425
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
4170
3426
  for (int j = 0; j < 4; ++j) {
4171
3427
  y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
4172
3428
  y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
@@ -4176,8 +3432,8 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
4176
3432
  qs += 8;
4177
3433
  signs += 4;
4178
3434
  for (int l = 0; l < 4; ++l) {
4179
- const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
4180
- const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
3435
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
3436
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
4181
3437
  for (int j = 0; j < 4; ++j) {
4182
3438
  y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
4183
3439
  y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
@@ -4197,39 +3453,23 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
4197
3453
  assert(k % QK_K == 0);
4198
3454
  const int nb = k / QK_K;
4199
3455
 
4200
- float db[4];
4201
- uint16_t idx[4];
4202
- //const int8_t * grid[4];
4203
-
4204
3456
  for (int i = 0; i < nb; i++) {
4205
3457
 
4206
3458
  const float d = GGML_FP16_TO_FP32(x[i].d);
4207
- const uint8_t * sc = x[i].scales;
4208
- const uint8_t * qs = x[i].qs;
3459
+ const uint8_t * qs = x[i].qs;
3460
+ const uint16_t * qh = x[i].qh;
4209
3461
 
4210
- for (int i8 = 0; i8 < QK_K/8; i8 += 4) {
4211
- idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
4212
- idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
4213
- idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
4214
- idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
4215
- //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
4216
- //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
4217
- //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
4218
- //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
4219
- db[0] = d * (2*(sc[0] & 7) + 1);
4220
- db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
4221
- db[2] = d * (2*(sc[1] & 7) + 1);
4222
- db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
3462
+ for (int ib = 0; ib < QK_K/32; ++ib) {
3463
+ const float dl = d * (2*((qh[ib] >> 12) & 7) + 1);
3464
+ const float delta = qh[ib] & 0x8000 ? -IQ1S_DELTA : IQ1S_DELTA;
4223
3465
  for (int l = 0; l < 4; ++l) {
4224
- const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
3466
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
4225
3467
  for (int j = 0; j < 8; ++j) {
4226
- //y[j] = db[l] * grid[l][j];
4227
- y[j] = db[l] * grid[j];
3468
+ y[j] = dl * (grid[j] + delta);
4228
3469
  }
4229
3470
  y += 8;
4230
3471
  }
4231
3472
  qs += 4;
4232
- sc += 2;
4233
3473
  }
4234
3474
  }
4235
3475
  }
@@ -4783,10 +4023,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4783
4023
  const block_q8_1 * restrict b_y0 = &vy0[i];
4784
4024
  const block_q8_1 * restrict b_y1 = &vy1[i];
4785
4025
 
4786
- float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
4787
- GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
4788
- GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
4789
- GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
4026
+ float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
4027
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
4028
+ GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
4029
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
4790
4030
  summs0 += summs_t;
4791
4031
 
4792
4032
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
@@ -4807,10 +4047,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4807
4047
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
4808
4048
 
4809
4049
  // mmla into int32x4_t
4810
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
4811
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
4812
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
4813
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
4050
+ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
4051
+ GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
4052
+ GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
4053
+ GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
4814
4054
 
4815
4055
  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
4816
4056
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -4851,7 +4091,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4851
4091
  const block_q8_1 * restrict y0 = &y[i + 0];
4852
4092
  const block_q8_1 * restrict y1 = &y[i + 1];
4853
4093
 
4854
- summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
4094
+ summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
4855
4095
 
4856
4096
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
4857
4097
 
@@ -4874,8 +4114,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4874
4114
  const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
4875
4115
  const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
4876
4116
 
4877
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
4878
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
4117
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
4118
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
4879
4119
  }
4880
4120
 
4881
4121
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@@ -4888,9 +4128,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4888
4128
  // Main loop
4889
4129
  for (int i = 0; i < nb; ++i) {
4890
4130
  const float d0 = GGML_FP16_TO_FP32(x[i].d);
4891
- const float d1 = y[i].d;
4131
+ const float d1 = GGML_FP16_TO_FP32(y[i].d);
4892
4132
 
4893
- summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
4133
+ summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
4894
4134
 
4895
4135
  const __m256 d0v = _mm256_set1_ps( d0 );
4896
4136
  const __m256 d1v = _mm256_set1_ps( d1 );
@@ -4942,7 +4182,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4942
4182
 
4943
4183
  int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
4944
4184
 
4945
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
4185
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
4946
4186
  }
4947
4187
 
4948
4188
  *s = sumf;
@@ -4960,7 +4200,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4960
4200
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
4961
4201
  }
4962
4202
 
4963
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
4203
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
4964
4204
  }
4965
4205
 
4966
4206
  *s = sumf;
@@ -5296,8 +4536,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5296
4536
 
5297
4537
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
5298
4538
 
5299
- summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
5300
- summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
4539
+ summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
4540
+ summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
5301
4541
 
5302
4542
  // extract the 5th bit via lookup table ((b) << 4)
5303
4543
  memcpy(&qh0, x0->qh, sizeof(qh0));
@@ -5341,10 +4581,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5341
4581
 
5342
4582
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
5343
4583
  ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
5344
- ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
4584
+ ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
5345
4585
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
5346
4586
  ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
5347
- ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
4587
+ ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
5348
4588
  }
5349
4589
 
5350
4590
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@@ -5361,7 +4601,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5361
4601
  const block_q5_1 * restrict x0 = &x[i];
5362
4602
  const block_q8_1 * restrict y0 = &y[i];
5363
4603
 
5364
- summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
4604
+ summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
5365
4605
 
5366
4606
  const v128_t m4b = wasm_i8x16_splat(0x0F);
5367
4607
 
@@ -5408,7 +4648,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5408
4648
  wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
5409
4649
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
5410
4650
  wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
5411
- wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
4651
+ wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
5412
4652
  }
5413
4653
 
5414
4654
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -5423,14 +4663,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5423
4663
  for (int i = 0; i < nb; i++) {
5424
4664
  const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
5425
4665
 
5426
- summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
4666
+ summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
5427
4667
 
5428
4668
  __m256i qx = bytes_from_nibbles_32(x[i].qs);
5429
4669
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
5430
4670
  bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
5431
4671
  qx = _mm256_or_si256(qx, bxhi);
5432
4672
 
5433
- const __m256 dy = _mm256_set1_ps(y[i].d);
4673
+ const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
5434
4674
  const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
5435
4675
 
5436
4676
  const __m256 q = mul_sum_us8_pairs_float(qx, qy);
@@ -5450,7 +4690,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5450
4690
  for (int i = 0; i < nb; i++) {
5451
4691
  const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
5452
4692
 
5453
- summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
4693
+ summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
5454
4694
 
5455
4695
  __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
5456
4696
  const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -5464,7 +4704,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5464
4704
  bxh = _mm_or_si128(bxh, bxhih);
5465
4705
  bx_0 = MM256_SET_M128I(bxh, bxl);
5466
4706
 
5467
- const __m256 dy = _mm256_set1_ps(y[i].d);
4707
+ const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
5468
4708
  const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
5469
4709
 
5470
4710
  const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
@@ -5531,7 +4771,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5531
4771
 
5532
4772
  int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
5533
4773
 
5534
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
4774
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
5535
4775
  }
5536
4776
 
5537
4777
  *s = sumf;
@@ -5555,7 +4795,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5555
4795
  sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
5556
4796
  }
5557
4797
 
5558
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
4798
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
5559
4799
  }
5560
4800
 
5561
4801
  *s = sumf;
@@ -9563,7 +8803,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9563
8803
 
9564
8804
  const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
9565
8805
  const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
9566
- const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
8806
+ const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
9567
8807
 
9568
8808
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
9569
8809
  const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
@@ -9585,8 +8825,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9585
8825
  const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
9586
8826
  const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
9587
8827
 
9588
- const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
9589
- const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
8828
+ const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
8829
+ const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
9590
8830
 
9591
8831
  const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
9592
8832
 
@@ -9653,8 +8893,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9653
8893
 
9654
8894
  const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
9655
8895
  const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
9656
- const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
9657
- const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
8896
+ const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
8897
+ const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
9658
8898
 
9659
8899
  __m256i signs;
9660
8900
  signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
@@ -9757,8 +8997,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9757
8997
 
9758
8998
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
9759
8999
 
9760
- const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
9761
- const uint8x16_t mask2 = vld1q_u8(k_mask2);
9000
+ const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
9001
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
9762
9002
  const uint8x16_t m1 = vdupq_n_u8(1);
9763
9003
  const int32x4_t vzero = vdupq_n_s32(0);
9764
9004
 
@@ -9789,7 +9029,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9789
9029
  vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
9790
9030
  qs += 8;
9791
9031
 
9792
- vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
9032
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
9793
9033
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9794
9034
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9795
9035
  vs.val[0] = vceqq_u8(vs.val[0], mask2);
@@ -9798,7 +9038,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9798
9038
  q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
9799
9039
  q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
9800
9040
 
9801
- vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
9041
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
9802
9042
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9803
9043
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9804
9044
  vs.val[0] = vceqq_u8(vs.val[0], mask2);
@@ -9869,12 +9109,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9869
9109
  iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
9870
9110
  qs += 8;
9871
9111
 
9872
- __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
9112
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
9873
9113
  aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9874
9114
  const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
9875
9115
  const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
9876
9116
 
9877
- aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
9117
+ aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
9878
9118
  aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9879
9119
  const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
9880
9120
  const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
@@ -10074,7 +9314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10074
9314
  #endif
10075
9315
  }
10076
9316
 
10077
- void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9317
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10078
9318
  assert(n % QK_K == 0);
10079
9319
  assert(nrc == 1);
10080
9320
  UNUSED(nrc);
@@ -10089,18 +9329,35 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10089
9329
 
10090
9330
  #if defined(__ARM_NEON)
10091
9331
 
9332
+ typedef union {
9333
+ uint16x8_t vec_index;
9334
+ uint16_t index[8];
9335
+ } vec_index_t;
9336
+
10092
9337
  static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10093
9338
  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10094
9339
  };
10095
9340
 
10096
9341
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
10097
9342
 
10098
- const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
10099
- const uint8x16_t mask2 = vld1q_u8(k_mask2);
9343
+ static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
9344
+
9345
+ const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
9346
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
9347
+
9348
+ const int16x8_t hshift = vld1q_s16(k_shift);
9349
+ const uint16x8_t m256 = vdupq_n_u16(256);
9350
+ const uint8x16_t m1 = vdupq_n_u8(1);
10100
9351
 
10101
9352
  uint8x16x2_t vs;
10102
9353
  ggml_int8x16x4_t q3s;
10103
9354
  ggml_int8x16x4_t q8b;
9355
+ vec_index_t idx;
9356
+
9357
+ #if QK_K == 256
9358
+ uint32_t scales32[2];
9359
+ const uint8_t * scales8 = (const uint8_t *)scales32;
9360
+ #endif
10104
9361
 
10105
9362
  float sumf = 0;
10106
9363
  for (int i = 0; i < nb; ++i) {
@@ -10109,47 +9366,63 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10109
9366
  const uint8_t * restrict qh = x[i].qh;
10110
9367
  const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10111
9368
  const int8_t * restrict q8 = y[i].qs;
9369
+
9370
+ #if QK_K == 256
9371
+ memcpy(scales32, x[i].scales, 4);
9372
+ scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
9373
+ scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
9374
+ #endif
9375
+
10112
9376
  int sumi1 = 0, sumi2 = 0;
10113
9377
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10114
9378
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
10115
- const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
10116
- iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
10117
- const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
10118
- iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
10119
- const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
10120
- iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
10121
- const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
10122
- iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
10123
- qs += 16;
10124
9379
 
10125
- vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
9380
+ const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
9381
+ idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
9382
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
9383
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
9384
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
9385
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
9386
+ idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
9387
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
9388
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
9389
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
9390
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
9391
+
9392
+
9393
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
10126
9394
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10127
9395
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10128
- vs.val[0] = vceqq_u8(vs.val[0], mask2);
10129
- vs.val[1] = vceqq_u8(vs.val[1], mask2);
9396
+ vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
9397
+ vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
10130
9398
 
10131
- q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0]));
10132
- q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
9399
+ q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
9400
+ q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
10133
9401
 
10134
- vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
9402
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
10135
9403
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10136
9404
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10137
- vs.val[0] = vceqq_u8(vs.val[0], mask2);
10138
- vs.val[1] = vceqq_u8(vs.val[1], mask2);
9405
+ vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
9406
+ vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
10139
9407
 
10140
9408
  signs += 4;
10141
9409
 
10142
- q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0]));
10143
- q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1]));
9410
+ q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
9411
+ q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
10144
9412
 
10145
9413
  const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
10146
9414
  const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
9415
+ #if QK_K == 256
9416
+ sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
9417
+ sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
9418
+ #else
10147
9419
  sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
10148
9420
  sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4));
9421
+ #endif
10149
9422
  }
10150
9423
  sumf += d*(sumi1 + sumi2);
10151
9424
  }
10152
- *s = 0.25f * sumf;
9425
+ *s = sumf;
10153
9426
 
10154
9427
  #elif defined(__AVX2__)
10155
9428
 
@@ -10164,6 +9437,16 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10164
9437
  const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
10165
9438
  const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
10166
9439
 
9440
+ const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
9441
+ const __m256i idx_mask = _mm256_set1_epi32(256);
9442
+
9443
+ typedef union {
9444
+ __m256i vec[2];
9445
+ uint32_t index[16];
9446
+ } index_t;
9447
+
9448
+ index_t idx;
9449
+
10167
9450
  __m256 accumf = _mm256_setzero_ps();
10168
9451
  for (int i = 0; i < nb; ++i) {
10169
9452
  const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
@@ -10176,24 +9459,25 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10176
9459
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10177
9460
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10178
9461
  const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10179
- const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)],
10180
- iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)],
10181
- iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)],
10182
- iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)],
10183
- iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)],
10184
- iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)],
10185
- iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)],
10186
- iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
10187
- qs += 8;
10188
- const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)],
10189
- iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)],
10190
- iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)],
10191
- iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)],
10192
- iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)],
10193
- iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)],
10194
- iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)],
10195
- iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]);
10196
- qs += 8;
9462
+ const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
9463
+ idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
9464
+ idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
9465
+ idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
9466
+ idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
9467
+ idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
9468
+ idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
9469
+
9470
+ // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
9471
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
9472
+ //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
9473
+ const __m256i q2_1 = _mm256_set_epi32(
9474
+ iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
9475
+ iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
9476
+ );
9477
+ const __m256i q2_2 = _mm256_set_epi32(
9478
+ iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
9479
+ iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
9480
+ );
10197
9481
 
10198
9482
  __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
10199
9483
  aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
@@ -10221,7 +9505,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10221
9505
 
10222
9506
  }
10223
9507
 
10224
- *s = 0.25f * hsum_float_8(accumf);
9508
+ *s = hsum_float_8(accumf);
10225
9509
 
10226
9510
  #else
10227
9511
 
@@ -10238,8 +9522,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10238
9522
  const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
10239
9523
  int32_t sumi = 0;
10240
9524
  for (int l = 0; l < 4; ++l) {
10241
- const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
10242
- const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
9525
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
9526
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
10243
9527
  for (int j = 0; j < 4; ++j) {
10244
9528
  sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10245
9529
  sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
@@ -10251,8 +9535,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10251
9535
  bsum += sumi * ls1;
10252
9536
  sumi = 0;
10253
9537
  for (int l = 0; l < 4; ++l) {
10254
- const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
10255
- const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
9538
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
9539
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
10256
9540
  for (int j = 0; j < 4; ++j) {
10257
9541
  sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
10258
9542
  sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
@@ -10265,7 +9549,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10265
9549
  }
10266
9550
  sumf += d * bsum;
10267
9551
  }
10268
- *s = 0.25f * sumf;
9552
+ *s = sumf;
10269
9553
  #endif
10270
9554
  }
10271
9555
 
@@ -10278,7 +9562,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
10278
9562
  }
10279
9563
  #endif
10280
9564
 
10281
- void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9565
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10282
9566
  assert(n % QK_K == 0);
10283
9567
  assert(nrc == 1);
10284
9568
  UNUSED(nrc);
@@ -10291,155 +9575,119 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
10291
9575
 
10292
9576
  const int nb = n / QK_K;
10293
9577
 
10294
- // TODO: implement for QK_K = 64
10295
- #if defined __ARM_NEON && QK_K == 256
10296
-
10297
- const uint8x16_t m8 = vdupq_n_u8(0x08);
10298
- const uint8x16_t m7 = vdupq_n_u8(0x07);
10299
- const uint8x16_t m1 = vdupq_n_u8(0x01);
10300
- const int32x4_t vzero = vdupq_n_s32(0);
9578
+ #if defined __ARM_NEON
10301
9579
 
10302
- uint16_t gindex[8];
10303
- uint16x8x2_t vindex;
10304
- int8x16x4_t q1b;
9580
+ ggml_int8x16x4_t q1b;
10305
9581
  ggml_int8x16x4_t q8b;
10306
- uint16x8x4_t scales;
10307
- int32x4x2_t sumi;
10308
- int32x4x2_t dotq;
10309
9582
 
10310
9583
  float sumf = 0;
10311
9584
  for (int i = 0; i < nb; ++i) {
10312
9585
 
10313
- const int8_t * q8 = y[i].qs;
10314
- const uint8_t * qs = x[i].qs;
10315
- const uint8_t * sc = x[i].scales;
9586
+ const int8_t * q8 = y[i].qs;
9587
+ const uint8_t * qs = x[i].qs;
9588
+ const uint16_t * qh = x[i].qh;
10316
9589
 
10317
- sumi.val[0] = sumi.val[1] = vzero;
9590
+ int sumi1 = 0, sumi2 = 0, sumi3 = 0;
10318
9591
 
10319
- for (int i128 = 0; i128 < QK_K/128; ++i128) {
10320
- const uint8x16_t ql = vld1q_u8(qs); qs += 16;
10321
- const uint8x8_t tm1 = vld1_u8 (sc); sc += 8;
10322
- const uint8x8_t tm2 = vshr_n_u8(tm1, 4);
10323
- const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
10324
- const uint8x16_t hbit = vandq_u8(qh, m8);
10325
- vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
10326
- vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
10327
- const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
10328
- scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
10329
- scales.val[1] = vmovl_u8(vget_high_u8 (scales8));
9592
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
10330
9593
 
10331
- for (int l = 0; l < 2; ++l) {
10332
- vst1q_u16(gindex+0, vindex.val[l]);
10333
- q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1])));
10334
- q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3])));
10335
- q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5])));
10336
- q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7])));
10337
- q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9594
+ q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))),
9595
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700)))));
9596
+ q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))),
9597
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700)))));
9598
+ q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))),
9599
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700)))));
9600
+ q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))),
9601
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700)))));
9602
+ qs += 8;
10338
9603
 
10339
- dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1]));
10340
- dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3]));
9604
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9605
+
9606
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]);
9607
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]);
9608
+
9609
+ const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
9610
+ const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
9611
+ sumi1 += vaddvq_s32(p1) * ls1;
9612
+ sumi2 += vaddvq_s32(p2) * ls2;
9613
+ sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1)
9614
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1);
10341
9615
 
10342
- sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
10343
- sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
10344
- }
10345
9616
  }
10346
9617
 
10347
- sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1]));
9618
+ sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
10348
9619
  }
10349
9620
 
10350
9621
  *s = sumf;
10351
9622
 
10352
- // TODO: implement for QK_K = 64
10353
- #elif defined __AVX2__ && QK_K == 256
10354
-
10355
- const __m128i m8 = _mm_set1_epi8(0x08);
10356
- const __m128i m7 = _mm_set1_epi8(0x07);
10357
- const __m128i m1 = _mm_set1_epi8(0x01);
10358
- const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
10359
- const __m128i shuffle_s[4] = {
10360
- _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
10361
- _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
10362
- _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
10363
- _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
10364
- };
10365
-
10366
- uint64_t aux64;
10367
-
10368
- typedef union m256i_uint16 {
10369
- __m256i reg;
10370
- uint16_t s[16];
10371
- } m256i_uint16_t;
10372
-
10373
- m256i_uint16_t v_gindex;
9623
+ #elif defined __AVX2__
10374
9624
 
10375
9625
  __m256 accum = _mm256_setzero_ps();
9626
+ float accum1 = 0;
10376
9627
  for (int i = 0; i < nb; ++i) {
10377
9628
 
10378
- const int8_t * q8 = y[i].qs;
10379
- const uint8_t * qs = x[i].qs;
10380
- const uint8_t * sc = x[i].scales;
9629
+ const int8_t * q8 = y[i].qs;
9630
+ const uint8_t * qs = x[i].qs;
9631
+ const uint16_t * qh = x[i].qh;
10381
9632
 
10382
9633
  __m256i sumi = _mm256_setzero_si256();
10383
- for (int i128 = 0; i128 < QK_K/128; ++i128) {
10384
- const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10385
- memcpy(&aux64, sc, 8); sc += 8;
10386
- const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
10387
- const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
10388
- v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
10389
- const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
9634
+ int sumi1 = 0;
9635
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
9636
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
9637
+ iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
9638
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
9639
+ iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
9640
+ qs += 8;
9641
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
9642
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10390
9643
 
10391
- for (int i32 = 0; i32 < 4; ++i32) {
10392
- const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10393
- const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
10394
- iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
10395
- const __m256i dot = mul_add_epi8(q1b, q8b);
10396
- const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
10397
- const __m256i p = _mm256_madd_epi16(s16, dot);
10398
- sumi = _mm256_add_epi32(sumi, p);
10399
- }
9644
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
9645
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
9646
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
9647
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
9648
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
9649
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
10400
9650
 
9651
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
9652
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
9653
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
10401
9654
  }
10402
9655
 
10403
- accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum);
9656
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
9657
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
9658
+ accum1 += d * sumi1;
10404
9659
 
10405
9660
  }
10406
9661
 
10407
- *s = hsum_float_8(accum);
9662
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
10408
9663
 
10409
9664
  #else
10410
9665
 
10411
- int db[4];
10412
- uint16_t idx[4];
10413
-
10414
9666
  float sumf = 0;
10415
- for (int i = 0; i < nb; ++i) {
9667
+ for (int i = 0; i < nb; i++) {
10416
9668
 
10417
- const int8_t * q8 = y[i].qs;
10418
- const uint8_t * qs = x[i].qs;
10419
- const uint8_t * sc = x[i].scales;
9669
+ const int8_t * q8 = y[i].qs;
9670
+ const uint8_t * qs = x[i].qs;
9671
+ const uint16_t * qh = x[i].qh;
10420
9672
 
10421
- int sumi = 0;
10422
- for (int i32 = 0; i32 < QK_K/32; ++i32) {
10423
- idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
10424
- idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
10425
- idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
10426
- idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
10427
- db[0] = (2*(sc[0] & 7) + 1);
10428
- db[1] = (2*((sc[0] >> 4) & 7) + 1);
10429
- db[2] = (2*(sc[1] & 7) + 1);
10430
- db[3] = (2*((sc[1] >> 4) & 7) + 1);
9673
+ int sumi = 0, sumi1 = 0;
9674
+ for (int ib = 0; ib < QK_K/32; ++ib) {
9675
+ const int ls = 2*((qh[ib] >> 12) & 7) + 1;
9676
+ const int delta = qh[ib] & 0x8000 ? -1 : 1;
9677
+ int lsum = 0;
10431
9678
  for (int l = 0; l < 4; ++l) {
10432
- const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
10433
- int suml = 0;
10434
- for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j];
10435
- sumi += db[l] * suml;
9679
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
9680
+ for (int j = 0; j < 8; ++j) {
9681
+ lsum += q8[j] * grid[j];
9682
+ }
10436
9683
  q8 += 8;
10437
9684
  }
9685
+ sumi += ls * lsum;
9686
+ sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
10438
9687
  qs += 4;
10439
- sc += 2;
10440
9688
  }
10441
9689
 
10442
- sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
9690
+ sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
10443
9691
  }
10444
9692
 
10445
9693
  *s = sumf;
@@ -10508,10 +9756,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
10508
9756
  const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
10509
9757
  const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
10510
9758
  const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
10511
- const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10512
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10513
- const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10514
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
9759
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
9760
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
9761
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
9762
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10515
9763
  const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10516
9764
  const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10517
9765
  const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
@@ -10618,10 +9866,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
10618
9866
  const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10619
9867
  const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10620
9868
  const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
10621
- const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
10622
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
10623
- const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
10624
- _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
9869
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
9870
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
9871
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
9872
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
10625
9873
  const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
10626
9874
  const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
10627
9875
  const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
@@ -10700,7 +9948,7 @@ static inline int iq2_grid_size(enum ggml_type type) {
10700
9948
  GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10701
9949
  return type == GGML_TYPE_IQ2_XXS ? 256 :
10702
9950
  type == GGML_TYPE_IQ2_XS ? 512 :
10703
- type == GGML_TYPE_IQ1_S ? 512 : 1024;
9951
+ type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024;
10704
9952
  }
10705
9953
 
10706
9954
  static int iq2_compare_func(const void * left, const void * right) {
@@ -10767,39 +10015,135 @@ void iq2xs_init_impl(enum ggml_type type) {
10767
10015
  40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
10768
10016
  42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
10769
10017
  };
10770
- static const uint16_t kgrid_1bit_512[512] = {
10771
- 10, 33, 41, 85, 132, 134, 160, 162, 277, 337, 340, 345, 357, 405, 516, 545,
10772
- 553, 598, 641, 650, 681, 1042, 1044, 1097, 1169, 1176, 1320, 1345, 1365, 1378, 1434, 1444,
10773
- 1545, 1617, 1642, 1685, 2053, 2080, 2089, 2133, 2176, 2182, 2208, 2214, 2306, 2384, 2393, 2440,
10774
- 2453, 2581, 2664, 2690, 2721, 4117, 4161, 4182, 4184, 4261, 4357, 4369, 4372, 4377, 4390, 4422,
10775
- 4432, 4437, 4449, 4457, 4485, 4497, 4505, 4629, 4677, 4696, 4774, 5205, 5217, 5225, 5386, 5397,
10776
- 5409, 5445, 5457, 5460, 5461, 5462, 5465, 5472, 5477, 5525, 5545, 5650, 5668, 5717, 5729, 5769,
10777
- 5777, 6212, 6234, 6244, 6293, 6424, 6482, 6485, 6502, 6505, 6529, 6538, 6565, 6656, 6682, 6788,
10778
- 6806, 6820, 8218, 8224, 8226, 8232, 8277, 8326, 8354, 8469, 8521, 8530, 8549, 8596, 8737, 8794,
10779
- 9221, 9253, 9348, 9369, 9380, 9474, 9557, 9633, 9732, 9753, 9793, 9830, 9862, 9880, 10240, 10272,
10780
- 10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665,
10781
- 16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685,
10782
- 17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529,
10783
- 18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517,
10784
- 20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872,
10785
- 20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653,
10786
- 21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842,
10787
- 21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913,
10788
- 21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608,
10789
- 22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072,
10790
- 23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110,
10791
- 25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937,
10792
- 25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885,
10793
- 26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808,
10794
- 32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320,
10795
- 33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918,
10796
- 34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125,
10797
- 37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973,
10798
- 38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485,
10799
- 38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497,
10800
- 39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514,
10801
- 41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
10802
- 42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
10018
+ static const uint16_t kgrid_1bit_2048[NGRID_IQ1S] = {
10019
+ 0, 2, 5, 8, 10, 17, 21, 32, 34, 40, 42, 69, 81, 84, 86, 101,
10020
+ 128, 130, 136, 138, 149, 160, 162, 168, 170, 260, 261, 273, 276, 278, 281, 282,
10021
+ 293, 321, 326, 329, 338, 341, 346, 353, 356, 358, 360, 389, 401, 404, 406, 421,
10022
+ 512, 514, 520, 522, 533, 544, 546, 552, 554, 581, 593, 601, 612, 617, 640, 642,
10023
+ 648, 650, 657, 661, 665, 672, 674, 680, 682, 1041, 1044, 1046, 1061, 1089, 1097, 1109,
10024
+ 1114, 1124, 1125, 1169, 1177, 1189, 1281, 1284, 1285, 1286, 1301, 1304, 1306, 1321, 1344, 1349,
10025
+ 1354, 1360, 1361, 1364, 1365, 1366, 1369, 1376, 1378, 1381, 1384, 1386, 1409, 1425, 1429, 1432,
10026
+ 1434, 1441, 1444, 1445, 1446, 1449, 1556, 1561, 1601, 1604, 1616, 1618, 1621, 1624, 1632, 1633,
10027
+ 1638, 1641, 1669, 1681, 1684, 1689, 2048, 2050, 2056, 2058, 2069, 2080, 2082, 2088, 2090, 2117,
10028
+ 2129, 2134, 2149, 2176, 2178, 2184, 2186, 2197, 2208, 2210, 2216, 2218, 2309, 2321, 2324, 2329,
10029
+ 2340, 2341, 2369, 2384, 2385, 2389, 2401, 2404, 2409, 2449, 2452, 2454, 2457, 2469, 2560, 2562,
10030
+ 2568, 2570, 2581, 2592, 2594, 2600, 2602, 2629, 2641, 2649, 2657, 2661, 2688, 2690, 2693, 2696,
10031
+ 2698, 2709, 2720, 2722, 2728, 2730, 4112, 4113, 4116, 4121, 4132, 4133, 4161, 4164, 4176, 4181,
10032
+ 4184, 4193, 4196, 4197, 4201, 4241, 4244, 4246, 4257, 4261, 4353, 4356, 4358, 4361, 4368, 4370,
10033
+ 4373, 4376, 4385, 4388, 4393, 4421, 4426, 4432, 4433, 4434, 4436, 4437, 4438, 4441, 4448, 4453,
10034
+ 4484, 4498, 4501, 4513, 4516, 4625, 4628, 4630, 4645, 4672, 4678, 4681, 4690, 4693, 4696, 4698,
10035
+ 4708, 4710, 4741, 4753, 4756, 4758, 4773, 5121, 5126, 5129, 5140, 5141, 5144, 5145, 5153, 5158,
10036
+ 5185, 5189, 5190, 5192, 5194, 5201, 5204, 5205, 5206, 5209, 5218, 5221, 5224, 5252, 5257, 5264,
10037
+ 5268, 5269, 5272, 5273, 5274, 5281, 5284, 5285, 5289, 5378, 5381, 5386, 5393, 5396, 5397, 5398,
10038
+ 5401, 5408, 5410, 5413, 5416, 5418, 5441, 5444, 5445, 5446, 5457, 5458, 5460, 5461, 5462, 5465,
10039
+ 5466, 5473, 5476, 5477, 5478, 5481, 5504, 5506, 5508, 5509, 5512, 5514, 5520, 5521, 5524, 5525,
10040
+ 5526, 5529, 5530, 5536, 5538, 5541, 5633, 5636, 5637, 5638, 5653, 5654, 5656, 5658, 5665, 5670,
10041
+ 5696, 5698, 5700, 5701, 5704, 5706, 5713, 5717, 5718, 5720, 5721, 5729, 5732, 5733, 5736, 5737,
10042
+ 5738, 5766, 5770, 5778, 5781, 5796, 5801, 6161, 6166, 6181, 6209, 6212, 6214, 6217, 6224, 6229,
10043
+ 6232, 6234, 6240, 6241, 6244, 6246, 6249, 6277, 6289, 6292, 6309, 6416, 6418, 6421, 6426, 6433,
10044
+ 6437, 6466, 6468, 6469, 6472, 6481, 6484, 6485, 6486, 6489, 6490, 6496, 6501, 6506, 6537, 6545,
10045
+ 6546, 6549, 6552, 6561, 6566, 6569, 6665, 6678, 6692, 6694, 6724, 6726, 6729, 6736, 6738, 6741,
10046
+ 6744, 6753, 6758, 6761, 6789, 6801, 6806, 6810, 8192, 8194, 8200, 8202, 8213, 8224, 8226, 8229,
10047
+ 8232, 8234, 8261, 8273, 8281, 8289, 8293, 8320, 8322, 8328, 8330, 8341, 8352, 8354, 8357, 8360,
10048
+ 8362, 8453, 8465, 8468, 8473, 8485, 8514, 8516, 8521, 8533, 8536, 8538, 8545, 8548, 8549, 8550,
10049
+ 8581, 8592, 8598, 8601, 8613, 8705, 8712, 8714, 8721, 8725, 8736, 8738, 8744, 8746, 8773, 8785,
10050
+ 8790, 8793, 8805, 8833, 8840, 8842, 8849, 8853, 8864, 8866, 8872, 8874, 9221, 9236, 9238, 9241,
10051
+ 9253, 9284, 9285, 9286, 9289, 9298, 9301, 9304, 9306, 9318, 9349, 9361, 9364, 9369, 9377, 9381,
10052
+ 9481, 9493, 9505, 9513, 9536, 9541, 9544, 9553, 9556, 9557, 9561, 9570, 9573, 9576, 9609, 9616,
10053
+ 9620, 9621, 9624, 9626, 9633, 9636, 9638, 9641, 9733, 9744, 9746, 9753, 9765, 9793, 9801, 9813,
10054
+ 9824, 9825, 9833, 9860, 9862, 9872, 9882, 10240, 10242, 10248, 10250, 10261, 10272, 10274, 10280, 10282,
10055
+ 10309, 10321, 10324, 10341, 10368, 10370, 10376, 10378, 10400, 10402, 10408, 10410, 10505, 10513, 10516, 10521,
10056
+ 10533, 10566, 10569, 10578, 10581, 10593, 10596, 10598, 10601, 10629, 10640, 10646, 10649, 10660, 10661, 10752,
10057
+ 10754, 10760, 10762, 10784, 10786, 10792, 10794, 10821, 10833, 10838, 10841, 10853, 10880, 10882, 10888, 10890,
10058
+ 10901, 10912, 10914, 10920, 10922, 16389, 16401, 16406, 16421, 16457, 16466, 16469, 16472, 16474, 16481, 16484,
10059
+ 16486, 16532, 16537, 16545, 16550, 16640, 16641, 16644, 16646, 16649, 16658, 16661, 16662, 16664, 16666, 16673,
10060
+ 16678, 16681, 16709, 16712, 16714, 16721, 16724, 16725, 16726, 16729, 16730, 16741, 16744, 16746, 16769, 16772,
10061
+ 16774, 16784, 16786, 16789, 16800, 16801, 16802, 16901, 16913, 16916, 16918, 16933, 16961, 16978, 16981, 16986,
10062
+ 16996, 17001, 17033, 17044, 17061, 17409, 17429, 17433, 17449, 17477, 17480, 17482, 17489, 17492, 17493, 17494,
10063
+ 17505, 17506, 17509, 17512, 17514, 17537, 17542, 17545, 17552, 17554, 17557, 17568, 17569, 17577, 17665, 17666,
10064
+ 17669, 17674, 17681, 17684, 17685, 17686, 17689, 17696, 17701, 17706, 17729, 17732, 17733, 17734, 17737, 17744,
10065
+ 17745, 17748, 17749, 17750, 17752, 17753, 17761, 17764, 17765, 17766, 17769, 17794, 17796, 17797, 17800, 17809,
10066
+ 17812, 17813, 17814, 17817, 17818, 17829, 17832, 17834, 17921, 17925, 17929, 17940, 17941, 17944, 17946, 17953,
10067
+ 17956, 17961, 17984, 17986, 17989, 17992, 18000, 18001, 18002, 18005, 18006, 18009, 18018, 18021, 18024, 18049,
10068
+ 18053, 18058, 18068, 18069, 18081, 18084, 18086, 18437, 18449, 18453, 18458, 18469, 18498, 18505, 18512, 18517,
10069
+ 18520, 18529, 18532, 18534, 18537, 18565, 18577, 18580, 18582, 18585, 18597, 18689, 18693, 18694, 18698, 18704,
10070
+ 18708, 18709, 18712, 18721, 18724, 18726, 18752, 18757, 18762, 18769, 18770, 18772, 18773, 18774, 18777, 18784,
10071
+ 18786, 18789, 18790, 18794, 18822, 18825, 18834, 18837, 18838, 18840, 18849, 18852, 18854, 18857, 18966, 19012,
10072
+ 19014, 19017, 19029, 19032, 19034, 19044, 19049, 19092, 19109, 20481, 20484, 20485, 20486, 20489, 20498, 20501,
10073
+ 20506, 20513, 20516, 20521, 20544, 20549, 20552, 20561, 20564, 20565, 20566, 20569, 20581, 20584, 20614, 20617,
10074
+ 20629, 20632, 20640, 20641, 20646, 20649, 20741, 20744, 20745, 20746, 20753, 20756, 20757, 20758, 20760, 20761,
10075
+ 20768, 20773, 20774, 20776, 20778, 20801, 20804, 20805, 20806, 20809, 20816, 20817, 20818, 20820, 20821, 20822,
10076
+ 20824, 20825, 20826, 20833, 20836, 20837, 20838, 20841, 20866, 20869, 20881, 20884, 20885, 20886, 20889, 20896,
10077
+ 20901, 20906, 20993, 20998, 21010, 21013, 21018, 21025, 21028, 21058, 21061, 21066, 21073, 21076, 21077, 21078,
10078
+ 21081, 21090, 21093, 21125, 21136, 21138, 21141, 21145, 21146, 21156, 21508, 21509, 21521, 21524, 21525, 21526,
10079
+ 21528, 21529, 21537, 21541, 21544, 21546, 21569, 21572, 21573, 21574, 21577, 21578, 21584, 21585, 21588, 21589,
10080
+ 21590, 21592, 21593, 21594, 21601, 21602, 21604, 21605, 21606, 21609, 21632, 21640, 21642, 21649, 21652, 21653,
10081
+ 21654, 21657, 21665, 21668, 21669, 21674, 21761, 21762, 21764, 21765, 21766, 21769, 21776, 21777, 21778, 21780,
10082
+ 21781, 21782, 21785, 21786, 21793, 21796, 21797, 21798, 21801, 21824, 21825, 21826, 21828, 21829, 21830, 21832,
10083
+ 21833, 21840, 21841, 21842, 21844, 21845, 21846, 21848, 21849, 21850, 21856, 21857, 21860, 21861, 21862, 21864,
10084
+ 21865, 21866, 21889, 21892, 21893, 21897, 21898, 21904, 21905, 21908, 21909, 21910, 21912, 21913, 21921, 21924,
10085
+ 21925, 21926, 21929, 22016, 22017, 22018, 22020, 22022, 22024, 22025, 22033, 22036, 22037, 22040, 22041, 22048,
10086
+ 22049, 22050, 22052, 22053, 22054, 22056, 22057, 22081, 22085, 22086, 22088, 22089, 22090, 22096, 22097, 22098,
10087
+ 22100, 22101, 22102, 22104, 22105, 22106, 22113, 22116, 22117, 22121, 22146, 22149, 22150, 22152, 22153, 22154,
10088
+ 22161, 22165, 22170, 22178, 22181, 22182, 22184, 22185, 22532, 22533, 22534, 22537, 22544, 22549, 22552, 22561,
10089
+ 22570, 22597, 22600, 22602, 22609, 22612, 22613, 22614, 22616, 22617, 22624, 22626, 22628, 22629, 22658, 22665,
10090
+ 22672, 22674, 22677, 22680, 22689, 22697, 22785, 22786, 22789, 22794, 22801, 22804, 22805, 22806, 22809, 22821,
10091
+ 22849, 22852, 22853, 22854, 22857, 22864, 22865, 22866, 22868, 22869, 22870, 22872, 22873, 22874, 22881, 22884,
10092
+ 22885, 22886, 22889, 22913, 22917, 22921, 22929, 22932, 22933, 22934, 22936, 22937, 22949, 23044, 23048, 23061,
10093
+ 23066, 23072, 23077, 23078, 23081, 23109, 23112, 23113, 23121, 23125, 23126, 23128, 23129, 23138, 23141, 23144,
10094
+ 23146, 23169, 23178, 23186, 23189, 23190, 23192, 23194, 23201, 24581, 24596, 24598, 24601, 24613, 24644, 24656,
10095
+ 24661, 24662, 24664, 24666, 24673, 24676, 24678, 24681, 24705, 24726, 24741, 24833, 24836, 24838, 24841, 24850,
10096
+ 24853, 24865, 24866, 24870, 24873, 24901, 24905, 24913, 24917, 24918, 24921, 24933, 24934, 24938, 24964, 24970,
10097
+ 24978, 24981, 24993, 24998, 25001, 25105, 25110, 25113, 25152, 25153, 25158, 25173, 25174, 25176, 25184, 25221,
10098
+ 25233, 25238, 25253, 25617, 25618, 25621, 25622, 25626, 25633, 25638, 25641, 25664, 25666, 25669, 25672, 25674,
10099
+ 25681, 25684, 25685, 25686, 25689, 25690, 25696, 25698, 25701, 25732, 25733, 25737, 25744, 25746, 25748, 25749,
10100
+ 25750, 25752, 25754, 25761, 25764, 25769, 25861, 25864, 25866, 25873, 25877, 25878, 25881, 25924, 25925, 25926,
10101
+ 25929, 25936, 25937, 25940, 25941, 25942, 25945, 25953, 25956, 25957, 25958, 25961, 25990, 25993, 25994, 26001,
10102
+ 26005, 26006, 26009, 26010, 26018, 26021, 26022, 26024, 26114, 26121, 26133, 26144, 26150, 26152, 26153, 26176,
10103
+ 26181, 26184, 26186, 26193, 26196, 26197, 26198, 26200, 26202, 26208, 26213, 26216, 26240, 26242, 26245, 26250,
10104
+ 26260, 26262, 26264, 26265, 26272, 26276, 26278, 26282, 26646, 26649, 26661, 26689, 26706, 26709, 26714, 26721,
10105
+ 26729, 26757, 26769, 26776, 26790, 26881, 26884, 26896, 26901, 26913, 26916, 26918, 26921, 26944, 26945, 26949,
10106
+ 26950, 26952, 26961, 26964, 26965, 26966, 26969, 26976, 26981, 26986, 27010, 27012, 27018, 27029, 27041, 27044,
10107
+ 27045, 27049, 27153, 27158, 27160, 27201, 27204, 27209, 27216, 27221, 27224, 27226, 27236, 27237, 27241, 27270,
10108
+ 27284, 27288, 27290, 27302, 32768, 32770, 32776, 32778, 32800, 32802, 32808, 32810, 32837, 32848, 32849, 32852,
10109
+ 32854, 32857, 32869, 32896, 32898, 32904, 32906, 32917, 32928, 32930, 32936, 32938, 33029, 33041, 33044, 33046,
10110
+ 33049, 33061, 33089, 33092, 33097, 33104, 33106, 33109, 33110, 33112, 33113, 33124, 33126, 33129, 33157, 33161,
10111
+ 33172, 33174, 33177, 33189, 33280, 33282, 33288, 33290, 33301, 33312, 33314, 33320, 33322, 33361, 33364, 33369,
10112
+ 33381, 33408, 33410, 33416, 33418, 33429, 33440, 33442, 33448, 33450, 33812, 33817, 33857, 33860, 33873, 33877,
10113
+ 33882, 33889, 33892, 33897, 33940, 33945, 34049, 34057, 34066, 34069, 34074, 34086, 34089, 34112, 34113, 34117,
10114
+ 34120, 34129, 34132, 34133, 34134, 34137, 34138, 34149, 34150, 34152, 34154, 34177, 34180, 34182, 34185, 34192,
10115
+ 34194, 34197, 34200, 34214, 34321, 34326, 34329, 34341, 34369, 34372, 34377, 34378, 34384, 34389, 34393, 34394,
10116
+ 34401, 34406, 34410, 34437, 34449, 34458, 34468, 34816, 34818, 34824, 34826, 34837, 34848, 34850, 34856, 34858,
10117
+ 34881, 34885, 34897, 34900, 34905, 34917, 34921, 34944, 34946, 34952, 34954, 34965, 34976, 34978, 34984, 34986,
10118
+ 35077, 35078, 35089, 35092, 35094, 35109, 35137, 35140, 35142, 35145, 35152, 35154, 35157, 35162, 35169, 35172,
10119
+ 35205, 35222, 35225, 35237, 35328, 35330, 35336, 35338, 35349, 35360, 35362, 35368, 35370, 35397, 35409, 35412,
10120
+ 35414, 35456, 35458, 35464, 35466, 35477, 35488, 35490, 35496, 35498, 36869, 36881, 36886, 36888, 36889, 36901,
10121
+ 36929, 36934, 36937, 36949, 36952, 36954, 36969, 36970, 36997, 37009, 37012, 37014, 37017, 37029, 37121, 37124,
10122
+ 37126, 37129, 37136, 37141, 37144, 37146, 37153, 37156, 37158, 37161, 37184, 37189, 37200, 37201, 37204, 37205,
10123
+ 37206, 37209, 37218, 37221, 37252, 37254, 37266, 37269, 37272, 37281, 37284, 37286, 37289, 37381, 37393, 37396,
10124
+ 37401, 37413, 37444, 37446, 37449, 37456, 37458, 37461, 37464, 37478, 37481, 37509, 37524, 37526, 37545, 37889,
10125
+ 37892, 37894, 37904, 37909, 37912, 37926, 37952, 37962, 37969, 37972, 37973, 37974, 37976, 37977, 37984, 37985,
10126
+ 37986, 37989, 38020, 38022, 38034, 38036, 38037, 38040, 38049, 38057, 38144, 38149, 38152, 38154, 38160, 38161,
10127
+ 38164, 38165, 38166, 38169, 38177, 38181, 38185, 38186, 38209, 38212, 38213, 38214, 38217, 38224, 38225, 38226,
10128
+ 38228, 38229, 38230, 38232, 38233, 38234, 38241, 38244, 38245, 38246, 38249, 38273, 38277, 38280, 38289, 38290,
10129
+ 38292, 38293, 38294, 38297, 38298, 38304, 38306, 38309, 38312, 38314, 38401, 38404, 38416, 38421, 38425, 38432,
10130
+ 38438, 38441, 38469, 38472, 38473, 38481, 38482, 38485, 38486, 38489, 38501, 38504, 38530, 38532, 38537, 38538,
10131
+ 38546, 38548, 38549, 38564, 38566, 38569, 38917, 38934, 38937, 38949, 38977, 38982, 38992, 38994, 38997, 38998,
10132
+ 39002, 39012, 39013, 39045, 39057, 39062, 39065, 39077, 39172, 39174, 39177, 39184, 39186, 39189, 39192, 39194,
10133
+ 39200, 39201, 39204, 39206, 39232, 39234, 39237, 39240, 39242, 39249, 39252, 39253, 39254, 39257, 39266, 39269,
10134
+ 39270, 39274, 39297, 39300, 39312, 39314, 39317, 39322, 39329, 39334, 39429, 39445, 39461, 39492, 39494, 39497,
10135
+ 39504, 39509, 39512, 39521, 39557, 39569, 39572, 39573, 39574, 40960, 40962, 40968, 40970, 40981, 40992, 40994,
10136
+ 41000, 41002, 41029, 41041, 41044, 41046, 41049, 41088, 41090, 41096, 41098, 41109, 41120, 41122, 41128, 41130,
10137
+ 41221, 41225, 41233, 41236, 41238, 41241, 41242, 41286, 41289, 41297, 41301, 41304, 41306, 41313, 41316, 41349,
10138
+ 41360, 41362, 41366, 41369, 41474, 41480, 41482, 41488, 41497, 41506, 41512, 41514, 41541, 41553, 41558, 41561,
10139
+ 41573, 41600, 41602, 41608, 41610, 41621, 41632, 41634, 41640, 41642, 42009, 42021, 42049, 42052, 42064, 42068,
10140
+ 42069, 42072, 42074, 42081, 42085, 42086, 42088, 42089, 42117, 42246, 42249, 42256, 42258, 42261, 42264, 42278,
10141
+ 42281, 42306, 42309, 42321, 42324, 42325, 42326, 42329, 42341, 42346, 42369, 42372, 42373, 42374, 42377, 42386,
10142
+ 42389, 42392, 42501, 42513, 42518, 42522, 42529, 42533, 42564, 42566, 42570, 42578, 42581, 42582, 42584, 42592,
10143
+ 42594, 42630, 42640, 42645, 42646, 42649, 42657, 42660, 42662, 43008, 43010, 43016, 43018, 43040, 43042, 43048,
10144
+ 43050, 43089, 43092, 43094, 43097, 43136, 43138, 43144, 43146, 43157, 43168, 43170, 43176, 43178, 43269, 43284,
10145
+ 43289, 43297, 43301, 43329, 43344, 43349, 43354, 43361, 43366, 43369, 43408, 43414, 43520, 43522, 43528, 43530,
10146
+ 43552, 43554, 43560, 43562, 43601, 43604, 43606, 43648, 43650, 43656, 43658, 43669, 43680, 43682, 43688, 43690,
10803
10147
  };
10804
10148
  static const uint16_t kgrid_2bit_1024[1024] = {
10805
10149
  0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
@@ -10873,12 +10217,12 @@ void iq2xs_init_impl(enum ggml_type type) {
10873
10217
  const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
10874
10218
  const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
10875
10219
  type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
10876
- type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
10220
+ type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024;
10877
10221
  uint64_t * kgrid_q2xs;
10878
10222
  int * kmap_q2xs;
10879
10223
  uint16_t * kneighbors_q2xs;
10880
10224
 
10881
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10225
+ //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10882
10226
  uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
10883
10227
  for (int k = 0; k < grid_size; ++k) {
10884
10228
  int8_t * pos = (int8_t *)(the_grid + k);
@@ -10933,7 +10277,7 @@ void iq2xs_init_impl(enum ggml_type type) {
10933
10277
  }
10934
10278
  num_neighbors += n;
10935
10279
  }
10936
- printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10280
+ //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10937
10281
  kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
10938
10282
  iq2_data[gindex].neighbours = kneighbors_q2xs;
10939
10283
  int counter = 0;
@@ -11356,8 +10700,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
11356
10700
  }
11357
10701
  }
11358
10702
 
11359
- size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11360
- (void)hist;
10703
+ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11361
10704
  GGML_ASSERT(n_per_row%QK_K == 0);
11362
10705
  int nblock = n_per_row/QK_K;
11363
10706
  char * qrow = (char *)dst;
@@ -11369,8 +10712,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row,
11369
10712
  return nrow * nblock * sizeof(block_iq2_xxs);
11370
10713
  }
11371
10714
 
11372
- size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11373
- (void)hist;
10715
+ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11374
10716
  GGML_ASSERT(n_per_row%QK_K == 0);
11375
10717
  int nblock = n_per_row/QK_K;
11376
10718
  char * qrow = (char *)dst;
@@ -11474,7 +10816,7 @@ void iq3xs_init_impl(int grid_size) {
11474
10816
  int * kmap_q3xs;
11475
10817
  uint16_t * kneighbors_q3xs;
11476
10818
 
11477
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10819
+ //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
11478
10820
  uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
11479
10821
  for (int k = 0; k < grid_size; ++k) {
11480
10822
  int8_t * pos = (int8_t *)(the_grid + k);
@@ -11529,7 +10871,7 @@ void iq3xs_init_impl(int grid_size) {
11529
10871
  }
11530
10872
  num_neighbors += n;
11531
10873
  }
11532
- printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10874
+ //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
11533
10875
  kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
11534
10876
  iq3_data[gindex].neighbours = kneighbors_q3xs;
11535
10877
  int counter = 0;
@@ -11812,8 +11154,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11812
11154
  }
11813
11155
  }
11814
11156
 
11815
- size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11816
- (void)hist;
11157
+ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11817
11158
  GGML_ASSERT(n_per_row%QK_K == 0);
11818
11159
  int nblock = n_per_row/QK_K;
11819
11160
  char * qrow = (char *)dst;
@@ -11912,7 +11253,8 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11912
11253
  }
11913
11254
  float best = 0;
11914
11255
  float scale = max/(2*kMaxQ-1);
11915
- for (int is = -15; is <= 15; ++is) {
11256
+ for (int k = 0; k < bs4; ++k) is_on_grid[k] = false;
11257
+ for (int is = -9; is <= 9; ++is) {
11916
11258
  float id = (2*kMaxQ-1+is*0.2f)/max;
11917
11259
  float this_scale = 1/id;
11918
11260
  for (int k = 0; k < bs4; ++k) {
@@ -11948,7 +11290,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11948
11290
  if (n_not_ongrid > 0 && scale > 0) {
11949
11291
  float id = 1/scale;
11950
11292
  for (int k = 0; k < bs4; ++k) {
11951
- if (is_on_grid[k]) continue;
11293
+ //if (is_on_grid[k]) continue;
11952
11294
  uint16_t u = 0;
11953
11295
  for (int i = 0; i < 4; ++i) {
11954
11296
  int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
@@ -12004,7 +11346,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
12004
11346
  }
12005
11347
 
12006
11348
  float d = max_scale/31;
12007
- y[ibl].d = GGML_FP32_TO_FP16(d);
11349
+ y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f);
12008
11350
  float id = 1/d;
12009
11351
  for (int ib = 0; ib < QK_K/block_size; ib += 2) {
12010
11352
  int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
@@ -12018,8 +11360,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
12018
11360
  }
12019
11361
 
12020
11362
  #define IQ3S_BLOCK_SIZE 32
12021
- size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12022
- (void)hist;
11363
+ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12023
11364
  GGML_ASSERT(n_per_row%QK_K == 0);
12024
11365
  int nblock = n_per_row/QK_K;
12025
11366
  float scales[QK_K/IQ3S_BLOCK_SIZE];
@@ -12049,7 +11390,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
12049
11390
 
12050
11391
  // Reference entry point for IQ3_S quantization of a single row.
  // x: k input floats; y: destination blocks; k must be a multiple of QK_K (asserted below).
  // This hunk shows both sides of the change: the removed call still passed the extra `hist`
  // argument, the added call matches the new quantize_iq3_s signature without it.
  void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
12051
11392
  assert(k % QK_K == 0);
12052
- quantize_iq3_s(x, y, 1, k, NULL, NULL);
11393
  // nrow = 1, n_per_row = k; the trailing NULL is the quant_weights argument (none supplied here).
+ quantize_iq3_s(x, y, 1, k, NULL);
12053
11394
  }
12054
11395
 
12055
11396
 
@@ -12115,12 +11456,70 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
12115
11456
  return grid_index;
12116
11457
  }
12117
11458
 
11459
// Chooses, for one group of 8 values, the grid point that minimizes the weighted squared error
//     sum_i weight[i] * (scale*xg[q_i] - xval[i])^2
// where q_i = (g_i - 1)/2 and g_i is the i-th int8 byte of a packed 64-bit grid entry.
//
//   neighbours  neighbours[0] is the number of candidates; neighbours[1..count] are indices into grid
//   grid        table of ngrid packed entries (8 x int8 per uint64_t)
//   xval        the 8 values being quantized
//   weight      the 8 per-value weights
//   scale       scale applied to the xg levels
//   xg          quantization levels indexed by q_i (presumably 3 levels -- confirm against the caller)
//   L           output: receives the 8 level indices q_i of the selected grid point
//   ngrid       number of entries in grid (used only by the brute-force fallback)
//
// Returns the index of the selected grid point; asserts that one was found.
static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);
    float best_score = FLT_MAX;
    int grid_index = -1;
    // First pass: only look at the pre-computed neighbour candidates.
    for (int j = 1; j <= num_neighbors; ++j) {
        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
        float d2 = 0;
        for (int i = 0; i < 8; ++i) {
            float q = xg[(pg[i] - 1)/2];
            float w = weight[i];
            float diff = scale*q - xval[i];
            d2 += w*diff*diff;
        }
        if (d2 < best_score) {
            best_score = d2;
            grid_index = neighbours[j];
        }
    }
    if (grid_index < 0) {
        // No candidate scored below FLT_MAX (the comparison also fails for NaN scores):
        // fall back to an exhaustive scan of the whole grid.
        for (int i = 0; i < ngrid; ++i) {
            const int8_t * grid_i = (const int8_t *)(grid + i);
            float d2 = 0;
            for (int j = 0; j < 8; ++j) {
                float w = weight[j];
                float q = xg[(grid_i[j] - 1)/2];
                // Index xval by the element j, not the grid index i: xval holds only 8 values,
                // so xval[i] read out of bounds and compared every element against the wrong input.
                float diff = scale*q - xval[j];
                d2 += w*diff*diff;
            }
            if (d2 < best_score) {
                best_score = d2;
                grid_index = i;
            }
        }
    }
    if (grid_index < 0) {
        // Still nothing: dump per-neighbour statistics before the assertion below fires.
        printf("Oops, did not find grid point\n");
        printf("Have %d neighbours\n", num_neighbors);
        for (int j = 1; j <= num_neighbors; ++j) {
            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
            float sumqx = 0, sumq2 = 0;
            for (int i = 0; i < 8; ++i) {
                float q = xg[(pg[i] - 1)/2];
                float w = weight[i];
                sumqx += w*q*xval[i];
                sumq2 += w*q*q;
            }
            printf(" neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
        }
    }
    GGML_ASSERT(grid_index >= 0);
    // Report the level index of each of the 8 elements of the winning grid point.
    const int8_t * pg = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
    return grid_index;
}
11515
+
12118
11516
  static int iq1_sort_helper(const void * left, const void * right) {
12119
11517
  const float * l = left;
12120
11518
  const float * r = right;
12121
11519
  return *l < *r ? -1 : *l > *r ? 1 : 0;
12122
11520
  }
12123
11521
 
11522
+ #define IQ1S_BLOCK_SIZE 32
12124
11523
  static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12125
11524
 
12126
11525
  const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
@@ -12139,37 +11538,41 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
12139
11538
 
12140
11539
  block_iq1_s * y = vy;
12141
11540
 
12142
- float scales[QK_K/8];
12143
- float weight[8];
12144
- int8_t L[8];
12145
- float sumx[9];
12146
- float sumw[9];
12147
- float pairs[16];
11541
+ const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
11542
+ const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
11543
+
11544
+ float scales[QK_K/IQ1S_BLOCK_SIZE];
11545
+ float weight[IQ1S_BLOCK_SIZE];
11546
+ int8_t L[IQ1S_BLOCK_SIZE];
11547
+ float sumx[IQ1S_BLOCK_SIZE+1];
11548
+ float sumw[IQ1S_BLOCK_SIZE+1];
11549
+ float pairs[2*IQ1S_BLOCK_SIZE];
12148
11550
  int * idx = (int *)(pairs + 1);
12149
- uint8_t hbit[QK_K/8];
11551
+ uint16_t index[IQ1S_BLOCK_SIZE/8];
11552
+ int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
12150
11553
 
12151
11554
  for (int ibl = 0; ibl < nbl; ++ibl) {
12152
11555
 
12153
11556
  y[ibl].d = GGML_FP32_TO_FP16(0.f);
12154
11557
  memset(y[ibl].qs, 0, QK_K/8);
12155
- memset(y[ibl].scales, 0, QK_K/16);
11558
+ memset(y[ibl].qh, 0, QK_K/16);
12156
11559
 
12157
11560
  float max_scale = 0;
12158
11561
 
12159
11562
  const float * xbl = x + QK_K*ibl;
12160
11563
  float sumx2 = 0;
12161
11564
  for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
12162
- float sigma2 = sumx2/QK_K;
11565
+ float sigma2 = 2*sumx2/QK_K;
12163
11566
 
12164
- for (int ib = 0; ib < QK_K/8; ++ib) {
12165
- const float * xb = xbl + 8*ib;
12166
- const float * qw = quant_weights + QK_K*ibl + 8*ib;
12167
- for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11567
+ for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
11568
+ const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
11569
+ const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
11570
+ for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
12168
11571
  float max = fabsf(xb[0]);
12169
- for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i]));
11572
+ for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
12170
11573
  if (!max) {
12171
11574
  scales[ib] = 0;
12172
- memset(L, 1, 8);
11575
+ memset(L, 1, IQ1S_BLOCK_SIZE);
12173
11576
  continue;
12174
11577
  }
12175
11578
  // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
@@ -12178,52 +11581,81 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
12178
11581
  // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
12179
11582
  // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
12180
11583
  // and score for each possible split.
12181
- for (int j = 0; j < 8; ++j) {
11584
+ for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
12182
11585
  pairs[2*j] = xb[j];
12183
11586
  idx[2*j] = j;
12184
11587
  }
12185
- qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper);
11588
+ qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
12186
11589
  {
12187
11590
  sumx[0] = sumw[0] = 0;
12188
- for (int j = 0; j < 8; ++j) {
11591
+ for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
12189
11592
  int i = idx[2*j];
12190
11593
  sumx[j+1] = sumx[j] + weight[i]*xb[i];
12191
11594
  sumw[j+1] = sumw[j] + weight[i];
12192
11595
  }
12193
11596
  }
12194
11597
  float best_score = 0, scale = max;
12195
- int besti1 = 0, besti2 = 0;
12196
- for (int i1 = 0; i1 <= 8; ++i1) {
12197
- for (int i2 = i1; i2 <= 8; ++i2) {
12198
- float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]);
12199
- float sumq2 = (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]);
11598
+ int besti1 = -1, besti2 = -1, best_shift = 0;
11599
+ for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
11600
+ for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
11601
+ float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
11602
+ float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
12200
11603
  if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
12201
11604
  scale = sumqx/sumq2; best_score = scale*sumqx;
12202
- besti1 = i1; besti2 = i2;
11605
+ besti1 = i1; besti2 = i2; best_shift = 1;
11606
+ }
11607
+ sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
11608
+ sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
11609
+ if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
11610
+ scale = sumqx/sumq2; best_score = scale*sumqx;
11611
+ besti1 = i1; besti2 = i2; best_shift = -1;
12203
11612
  }
12204
11613
  }
12205
11614
  }
11615
+ GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
12206
11616
  for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
12207
11617
  for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
12208
- for (int j = besti2; j < 8; ++j) L[idx[2*j]] = 2;
11618
+ for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
12209
11619
  if (scale < 0) {
12210
- for (int j = 0; j < 8; ++j) L[j] = 2 - L[j];
12211
- scale = -scale;
11620
+ for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
11621
+ scale = -scale; best_shift = -best_shift;
11622
+ }
11623
+ bool all_on_grid = true;
11624
+ const float * xx = best_shift == 1 ? x_p : x_m;
11625
+ for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
11626
+ uint16_t u = 0;
11627
+ for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
11628
+ int grid_index = kmap_q2xs[u];
11629
+ if (grid_index < 0) {
11630
+ all_on_grid = false;
11631
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
11632
+ grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
11633
+ GGML_ASSERT(grid_index >= 0);
11634
+ }
11635
+ index[k] = grid_index;
11636
+ }
11637
+ if (!all_on_grid) {
11638
+ float sumqx = 0, sumq2 = 0;
11639
+ for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
11640
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
11641
+ for (int j = 0; j < 8; ++j) {
11642
+ float w = weight[8*k + j];
11643
+ float q = xx[(pg[j] - 1)/2];
11644
+ sumqx += w*q*xb[8*k+j];
11645
+ sumq2 += w*q*q;
11646
+ }
11647
+ }
11648
+ if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
11649
+ }
11650
+ uint16_t h = 0;
11651
+ for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
11652
+ y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255;
11653
+ h |= (index[k] >> 8) << 3*k;
12212
11654
  }
12213
- // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring
12214
- // grid point that minimizes SSD.
12215
- uint16_t u = 0;
12216
- for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
12217
- int grid_index = kmap_q2xs[u];
12218
- if (grid_index < 0) {
12219
- const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12220
- grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
12221
- GGML_ASSERT(grid_index >= 0);
12222
- }
12223
- y[ibl].qs[ib] = grid_index & 255;
12224
- hbit[ib] = grid_index >> 8;
11655
+ y[ibl].qh[ib] = h;
12225
11656
  GGML_ASSERT(scale >= 0);
12226
11657
  scales[ib] = scale;
11658
+ shifts[ib] = best_shift;
12227
11659
  max_scale = MAX(max_scale, scale);
12228
11660
  }
12229
11661
 
@@ -12233,19 +11665,18 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
12233
11665
  }
12234
11666
 
12235
11667
  float d = max_scale/15;
12236
- y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
11668
+ y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
12237
11669
  float id = 1/d;
12238
- for (int ib = 0; ib < QK_K/8; ++ib) {
11670
+ for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
12239
11671
  int l = nearest_int(0.5f*(id*scales[ib]-1));
12240
11672
  l = MAX(0, MIN(7, l));
12241
- if (hbit[ib]) l |= 8;
12242
- y[ibl].scales[ib/2] |= (l << 4*(ib%2));
11673
+ if (shifts[ib] == -1) l |= 8;
11674
+ y[ibl].qh[ib] |= (l << 12);
12243
11675
  }
12244
11676
  }
12245
11677
  }
12246
11678
 
12247
- size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12248
- (void)hist;
11679
+ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12249
11680
  GGML_ASSERT(n_per_row%QK_K == 0);
12250
11681
  int nblock = n_per_row/QK_K;
12251
11682
  char * qrow = (char *)dst;
@@ -12270,7 +11701,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
12270
11701
  return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
12271
11702
  }
12272
11703
 
12273
- static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
11704
+ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
12274
11705
  ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
12275
11706
  float * scales, float * weight, uint8_t * L,
12276
11707
  const int8_t * values,
@@ -12378,8 +11809,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
12378
11809
  }
12379
11810
  }
12380
11811
 
12381
- size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12382
- (void)hist;
11812
+ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12383
11813
  GGML_ASSERT(n_per_row%QK4_NL == 0);
12384
11814
  int nblock = n_per_row/QK4_NL;
12385
11815
  char * qrow = (char *)dst;
@@ -12409,14 +11839,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12409
11839
 
12410
11840
  void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
12411
11841
  assert(k % QK4_NL == 0);
12412
- quantize_iq4_nl(x, y, 1, k, NULL, NULL);
11842
+ quantize_iq4_nl(x, y, 1, k, NULL);
12413
11843
  }
12414
11844
 
12415
- size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11845
+ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12416
11846
  #if QK_K == 64
12417
- return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
11847
+ return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
12418
11848
  #else
12419
- (void)hist;
12420
11849
  GGML_ASSERT(n_per_row%QK_K == 0);
12421
11850
  int nblock = n_per_row/QK_K;
12422
11851
  char * qrow = (char *)dst;
@@ -12445,7 +11874,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
12445
11874
 
12446
11875
  void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
12447
11876
  assert(k % QK_K == 0);
12448
- quantize_iq4_xs(x, y, 1, k, NULL, NULL);
11877
+ quantize_iq4_xs(x, y, 1, k, NULL);
12449
11878
  }
12450
11879
 
12451
11880
  // =============================== 2.5625 bpw
@@ -12618,8 +12047,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
12618
12047
  }
12619
12048
  }
12620
12049
 
12621
- size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12622
- (void)hist;
12050
+ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12623
12051
  GGML_ASSERT(n_per_row%QK_K == 0);
12624
12052
  int nblock = n_per_row/QK_K;
12625
12053
  char * qrow = (char *)dst;
@@ -12633,7 +12061,7 @@ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, in
12633
12061
 
12634
12062
  void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
12635
12063
  assert(k % QK_K == 0);
12636
- quantize_iq2_s(x, y, 1, k, NULL, NULL);
12064
+ quantize_iq2_s(x, y, 1, k, NULL);
12637
12065
  }
12638
12066
 
12639
12067
  void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {