llama_cpp 0.14.0 → 0.14.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,12 @@
1
+ #define GGML_COMMON_IMPL_C
2
+ #include "ggml-common.h"
3
+
1
4
  #include "ggml-quants.h"
2
5
  #include "ggml-impl.h"
3
6
 
7
+ #define GGML_COMMON_IMPL_C
8
+ #include "ggml-common.h"
9
+
4
10
  #include <math.h>
5
11
  #include <string.h>
6
12
  #include <assert.h>
@@ -948,7 +954,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
948
954
  const float d = amax / ((1 << 7) - 1);
949
955
  const float id = d ? 1.0f/d : 0.0f;
950
956
 
951
- y[i].d = d;
957
+ y[i].d = GGML_FP32_TO_FP16(d);
952
958
 
953
959
  int sum = 0;
954
960
 
@@ -963,7 +969,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
963
969
  sum += y[i].qs[QK8_1/2 + j];
964
970
  }
965
971
 
966
- y[i].s = sum*d;
972
+ y[i].s = GGML_FP32_TO_FP16(sum*d);
967
973
  }
968
974
  }
969
975
 
@@ -991,7 +997,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
991
997
  const float d = amax / ((1 << 7) - 1);
992
998
  const float id = d ? 1.0f/d : 0.0f;
993
999
 
994
- y[i].d = d;
1000
+ y[i].d = GGML_FP32_TO_FP16(d);
995
1001
 
996
1002
  int32x4_t accv = vdupq_n_s32(0);
997
1003
 
@@ -1007,7 +1013,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1007
1013
  accv = vaddq_s32(accv, vi);
1008
1014
  }
1009
1015
 
1010
- y[i].s = d * vaddvq_s32(accv);
1016
+ y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
1011
1017
  }
1012
1018
  #elif defined(__wasm_simd128__)
1013
1019
  for (int i = 0; i < nb; i++) {
@@ -1030,7 +1036,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1030
1036
  const float d = amax / ((1 << 7) - 1);
1031
1037
  const float id = d ? 1.0f/d : 0.0f;
1032
1038
 
1033
- y[i].d = d;
1039
+ y[i].d = GGML_FP32_TO_FP16(d);
1034
1040
 
1035
1041
  v128_t accv = wasm_i32x4_splat(0);
1036
1042
 
@@ -1046,10 +1052,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1046
1052
  accv = wasm_i32x4_add(accv, vi);
1047
1053
  }
1048
1054
 
1049
- y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
1050
- wasm_i32x4_extract_lane(accv, 1) +
1051
- wasm_i32x4_extract_lane(accv, 2) +
1052
- wasm_i32x4_extract_lane(accv, 3));
1055
+ y[i].s = GGML_FP32_TO_FP16(
1056
+ d * (wasm_i32x4_extract_lane(accv, 0) +
1057
+ wasm_i32x4_extract_lane(accv, 1) +
1058
+ wasm_i32x4_extract_lane(accv, 2) +
1059
+ wasm_i32x4_extract_lane(accv, 3)));
1053
1060
  }
1054
1061
  #elif defined(__AVX2__) || defined(__AVX__)
1055
1062
  for (int i = 0; i < nb; i++) {
@@ -1074,7 +1081,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1074
1081
 
1075
1082
  // Quantize these floats
1076
1083
  const float d = maxScalar / 127.f;
1077
- y[i].d = d;
1084
+ y[i].d = GGML_FP32_TO_FP16(d);
1078
1085
  const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
1079
1086
  const __m256 mul = _mm256_set1_ps( id );
1080
1087
 
@@ -1098,7 +1105,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1098
1105
 
1099
1106
  #if defined(__AVX2__)
1100
1107
  // Compute the sum of the quants and set y[i].s
1101
- y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
1108
+ y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
1102
1109
 
1103
1110
  // Convert int32 to int16
1104
1111
  i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
@@ -1128,7 +1135,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1128
1135
  // Compute the sum of the quants and set y[i].s
1129
1136
  const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
1130
1137
  const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
1131
- y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
1138
+ y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
1132
1139
 
1133
1140
  // Convert int32 to int16
1134
1141
  ni0 = _mm_packs_epi32( ni0, ni1 );
@@ -1159,7 +1166,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1159
1166
  const float d = amax / ((1 << 7) - 1);
1160
1167
  const float id = d ? 1.0f/d : 0.0f;
1161
1168
 
1162
- y[i].d = d;
1169
+ y[i].d = GGML_FP32_TO_FP16(d);
1163
1170
 
1164
1171
  vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
1165
1172
 
@@ -1176,7 +1183,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
1176
1183
 
1177
1184
  // set y[i].s
1178
1185
  int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
1179
- y[i].s = sum*d;
1186
+ y[i].s = GGML_FP32_TO_FP16(sum*d);
1180
1187
  }
1181
1188
  #else
1182
1189
  GGML_UNUSED(nb);
@@ -1701,16 +1708,6 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
1701
1708
  quantize_row_q2_K_reference(x, vy, k);
1702
1709
  }
1703
1710
 
1704
- size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
1705
- (void)hist; // TODO: collect histograms
1706
-
1707
- for (int j = 0; j < n; j += k) {
1708
- block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
1709
- quantize_row_q2_K_reference(src + j, y, k);
1710
- }
1711
- return (n/QK_K*sizeof(block_q2_K));
1712
- }
1713
-
1714
1711
  static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
1715
1712
  uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
1716
1713
  float rmin, float rdelta, int nstep, bool use_mad) {
@@ -1963,8 +1960,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1963
1960
  }
1964
1961
  }
1965
1962
 
1966
- size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
1967
- (void)hist;
1963
+ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
1968
1964
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1969
1965
  if (!quant_weights) {
1970
1966
  quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
@@ -2183,16 +2179,6 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
2183
2179
  quantize_row_q3_K_reference(x, vy, k);
2184
2180
  }
2185
2181
 
2186
- size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
2187
- (void)hist; // TODO: collect histograms
2188
-
2189
- for (int j = 0; j < n; j += k) {
2190
- block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
2191
- quantize_row_q3_K_reference(src + j, y, k);
2192
- }
2193
- return (n/QK_K*sizeof(block_q3_K));
2194
- }
2195
-
2196
2182
  static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2197
2183
  #if QK_K != 256
2198
2184
  (void)quant_weights;
@@ -2282,8 +2268,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
2282
2268
  #endif
2283
2269
  }
2284
2270
 
2285
- size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2286
- (void)hist;
2271
+ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2287
2272
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2288
2273
  if (!quant_weights) {
2289
2274
  quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
@@ -2453,17 +2438,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
2453
2438
  quantize_row_q4_K_reference(x, y, k);
2454
2439
  }
2455
2440
 
2456
- size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
2457
- assert(k % QK_K == 0);
2458
- (void)hist; // TODO: collect histograms
2459
-
2460
- for (int j = 0; j < n; j += k) {
2461
- block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
2462
- quantize_row_q4_K_reference(src + j, y, k);
2463
- }
2464
- return (n/QK_K*sizeof(block_q4_K));
2465
- }
2466
-
2467
2441
  static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2468
2442
  #if QK_K != 256
2469
2443
  (void)quant_weights;
@@ -2542,8 +2516,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2542
2516
  #endif
2543
2517
  }
2544
2518
 
2545
- size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2546
- (void)hist;
2519
+ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2547
2520
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2548
2521
  if (!quant_weights) {
2549
2522
  quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
@@ -2754,17 +2727,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
2754
2727
  quantize_row_q5_K_reference(x, y, k);
2755
2728
  }
2756
2729
 
2757
- size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
2758
- assert(k % QK_K == 0);
2759
- (void)hist; // TODO: collect histograms
2760
-
2761
- for (int j = 0; j < n; j += k) {
2762
- block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
2763
- quantize_row_q5_K_reference(src + j, y, k);
2764
- }
2765
- return (n/QK_K*sizeof(block_q5_K));
2766
- }
2767
-
2768
2730
  static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2769
2731
  #if QK_K != 256
2770
2732
  (void)quant_weights;
@@ -2863,8 +2825,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2863
2825
  #endif
2864
2826
  }
2865
2827
 
2866
- size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2867
- (void)hist;
2828
+ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
2868
2829
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2869
2830
  if (!quant_weights) {
2870
2831
  quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
@@ -3017,17 +2978,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
3017
2978
  quantize_row_q6_K_reference(x, y, k);
3018
2979
  }
3019
2980
 
3020
- size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
3021
- assert(k % QK_K == 0);
3022
- (void)hist; // TODO: collect histograms
3023
-
3024
- for (int j = 0; j < n; j += k) {
3025
- block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
3026
- quantize_row_q6_K_reference(src + j, y, k);
3027
- }
3028
- return (n/QK_K*sizeof(block_q6_K));
3029
- }
3030
-
3031
2981
  static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
3032
2982
  #if QK_K != 256
3033
2983
  (void)quant_weights;
@@ -3117,8 +3067,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
3117
3067
  #endif
3118
3068
  }
3119
3069
 
3120
- size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3121
- (void)hist;
3070
+ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3122
3071
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3123
3072
  if (!quant_weights) {
3124
3073
  quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
@@ -3162,9 +3111,10 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3162
3111
  }
3163
3112
  }
3164
3113
 
3165
- size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3114
+ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3166
3115
  if (!quant_weights) {
3167
- return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
3116
+ quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
3117
+ return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3168
3118
  }
3169
3119
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3170
3120
  char * qrow = (char *)dst;
@@ -3206,9 +3156,10 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3206
3156
  }
3207
3157
  }
3208
3158
 
3209
- size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3159
+ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3210
3160
  if (!quant_weights) {
3211
- return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
3161
+ quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
3162
+ return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3212
3163
  }
3213
3164
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3214
3165
  char * qrow = (char *)dst;
@@ -3259,9 +3210,10 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3259
3210
  }
3260
3211
  }
3261
3212
 
3262
- size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3213
+ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3263
3214
  if (!quant_weights) {
3264
- return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
3215
+ quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
3216
+ return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3265
3217
  }
3266
3218
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3267
3219
  char * qrow = (char *)dst;
@@ -3311,9 +3263,10 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3311
3263
  }
3312
3264
  }
3313
3265
 
3314
- size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3266
+ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3315
3267
  if (!quant_weights) {
3316
- return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
3268
+ quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
3269
+ return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3317
3270
  }
3318
3271
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3319
3272
  char * qrow = (char *)dst;
@@ -3325,712 +3278,14 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
3325
3278
  return nrow * row_size;
3326
3279
  }
3327
3280
 
3328
- // ====================== "True" 2-bit (de)-quantization
3329
-
3330
- static const uint64_t iq2xxs_grid[256] = {
3331
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3332
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
3333
- 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
3334
- 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
3335
- 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
3336
- 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
3337
- 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
3338
- 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
3339
- 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
3340
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
3341
- 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
3342
- 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
3343
- 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
3344
- 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
3345
- 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
3346
- 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
3347
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
3348
- 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
3349
- 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
3350
- 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
3351
- 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
3352
- 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
3353
- 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
3354
- 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
3355
- 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
3356
- 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
3357
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
3358
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
3359
- 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
3360
- 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
3361
- 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
3362
- 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
3363
- 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
3364
- 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
3365
- 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
3366
- 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
3367
- 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
3368
- 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
3369
- 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
3370
- 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
3371
- 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
3372
- 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
3373
- 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
3374
- 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
3375
- 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
3376
- 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
3377
- 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
3378
- 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
3379
- 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
3380
- 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
3381
- 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
3382
- 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
3383
- 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
3384
- 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
3385
- 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
3386
- 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
3387
- 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
3388
- 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
3389
- 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
3390
- 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
3391
- 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
3392
- 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
3393
- 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
3394
- 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
3395
- };
3396
-
3397
- static const uint64_t iq2xs_grid[512] = {
3398
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3399
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
3400
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
3401
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
3402
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
3403
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
3404
- 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
3405
- 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
3406
- 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
3407
- 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
3408
- 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
3409
- 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
3410
- 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
3411
- 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
3412
- 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
3413
- 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
3414
- 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
3415
- 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
3416
- 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
3417
- 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
3418
- 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
3419
- 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
3420
- 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
3421
- 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
3422
- 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
3423
- 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
3424
- 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
3425
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
3426
- 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
3427
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
3428
- 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
3429
- 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
3430
- 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
3431
- 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
3432
- 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
3433
- 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
3434
- 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
3435
- 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
3436
- 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
3437
- 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
3438
- 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
3439
- 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
3440
- 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
3441
- 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
3442
- 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
3443
- 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
3444
- 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
3445
- 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
3446
- 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
3447
- 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
3448
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
3449
- 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
3450
- 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
3451
- 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
3452
- 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
3453
- 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
3454
- 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
3455
- 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
3456
- 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
3457
- 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
3458
- 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
3459
- 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
3460
- 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
3461
- 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
3462
- 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
3463
- 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
3464
- 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
3465
- 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
3466
- 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
3467
- 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
3468
- 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
3469
- 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
3470
- 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
3471
- 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
3472
- 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
3473
- 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
3474
- 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
3475
- 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
3476
- 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
3477
- 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
3478
- 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
3479
- 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
3480
- 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
3481
- 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
3482
- 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
3483
- 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
3484
- 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
3485
- 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
3486
- 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
3487
- 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
3488
- 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
3489
- 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
3490
- 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
3491
- 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
3492
- 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
3493
- 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
3494
- 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
3495
- 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
3496
- 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
3497
- 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
3498
- 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
3499
- 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
3500
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
3501
- 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
3502
- 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
3503
- 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
3504
- 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
3505
- 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
3506
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
3507
- 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
3508
- 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
3509
- 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
3510
- 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
3511
- 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
3512
- 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
3513
- 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
3514
- 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
3515
- 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
3516
- 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
3517
- 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
3518
- 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
3519
- 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
3520
- 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
3521
- 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
3522
- 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
3523
- 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
3524
- 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
3525
- 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
3526
- };
3527
-
3528
- static const uint64_t iq2s_grid[1024] = {
3529
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3530
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
3531
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
3532
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
3533
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
3534
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
3535
- 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
3536
- 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
3537
- 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
3538
- 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
3539
- 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
3540
- 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
3541
- 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
3542
- 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
3543
- 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
3544
- 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
3545
- 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
3546
- 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
3547
- 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
3548
- 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
3549
- 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
3550
- 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
3551
- 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
3552
- 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
3553
- 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
3554
- 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
3555
- 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
3556
- 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
3557
- 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
3558
- 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
3559
- 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
3560
- 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
3561
- 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
3562
- 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
3563
- 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
3564
- 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
3565
- 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
3566
- 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
3567
- 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
3568
- 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
3569
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
3570
- 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
3571
- 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
3572
- 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
3573
- 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
3574
- 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
3575
- 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
3576
- 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
3577
- 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
3578
- 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
3579
- 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
3580
- 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
3581
- 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
3582
- 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
3583
- 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
3584
- 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
3585
- 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
3586
- 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
3587
- 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
3588
- 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
3589
- 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
3590
- 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
3591
- 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
3592
- 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
3593
- 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
3594
- 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
3595
- 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
3596
- 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
3597
- 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
3598
- 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
3599
- 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
3600
- 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
3601
- 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
3602
- 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
3603
- 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
3604
- 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
3605
- 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
3606
- 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
3607
- 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
3608
- 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
3609
- 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
3610
- 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
3611
- 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
3612
- 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
3613
- 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
3614
- 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
3615
- 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
3616
- 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
3617
- 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
3618
- 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
3619
- 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
3620
- 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
3621
- 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
3622
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
3623
- 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
3624
- 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
3625
- 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
3626
- 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
3627
- 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
3628
- 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
3629
- 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
3630
- 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
3631
- 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
3632
- 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
3633
- 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
3634
- 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
3635
- 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
3636
- 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
3637
- 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
3638
- 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
3639
- 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
3640
- 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
3641
- 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
3642
- 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
3643
- 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
3644
- 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
3645
- 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
3646
- 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
3647
- 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
3648
- 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
3649
- 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
3650
- 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
3651
- 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
3652
- 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
3653
- 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
3654
- 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
3655
- 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
3656
- 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
3657
- 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
3658
- 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
3659
- 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
3660
- 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
3661
- 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
3662
- 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
3663
- 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
3664
- 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
3665
- 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
3666
- 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
3667
- 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
3668
- 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
3669
- 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
3670
- 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
3671
- 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
3672
- 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
3673
- 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
3674
- 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
3675
- 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
3676
- 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
3677
- 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
3678
- 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
3679
- 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
3680
- 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
3681
- 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
3682
- 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
3683
- 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
3684
- 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
3685
- 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
3686
- 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
3687
- 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
3688
- 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
3689
- 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
3690
- 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
3691
- 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
3692
- 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
3693
- 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
3694
- 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
3695
- 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
3696
- 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
3697
- 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
3698
- 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
3699
- 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
3700
- 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
3701
- 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
3702
- 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
3703
- 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
3704
- 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
3705
- 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
3706
- 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
3707
- 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
3708
- 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
3709
- 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
3710
- 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
3711
- 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
3712
- 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
3713
- 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
3714
- 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
3715
- 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
3716
- 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
3717
- 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
3718
- 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
3719
- 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
3720
- 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
3721
- 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
3722
- 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
3723
- 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
3724
- 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
3725
- 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
3726
- 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
3727
- 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
3728
- 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
3729
- 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
3730
- 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
3731
- 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
3732
- 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
3733
- 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
3734
- 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
3735
- 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
3736
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
3737
- 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
3738
- 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
3739
- 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
3740
- 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
3741
- 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
3742
- 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
3743
- 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
3744
- 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
3745
- 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
3746
- 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
3747
- 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
3748
- 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
3749
- 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
3750
- 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
3751
- 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
3752
- 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
3753
- 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
3754
- 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
3755
- 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
3756
- 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
3757
- 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
3758
- 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
3759
- 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
3760
- 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
3761
- 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
3762
- 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
3763
- 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
3764
- 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
3765
- 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
3766
- 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
3767
- 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
3768
- 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
3769
- 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
3770
- 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
3771
- 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
3772
- 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
3773
- 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
3774
- 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
3775
- 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
3776
- 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
3777
- 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
3778
- 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
3779
- 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
3780
- 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
3781
- 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
3782
- 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
3783
- 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
3784
- 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
3785
- };
3786
-
3787
- static const uint32_t iq3xxs_grid[256] = {
3788
- 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
3789
- 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
3790
- 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
3791
- 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
3792
- 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
3793
- 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
3794
- 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
3795
- 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
3796
- 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
3797
- 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
3798
- 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
3799
- 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
3800
- 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
3801
- 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
3802
- 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
3803
- 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
3804
- 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
3805
- 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
3806
- 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
3807
- 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
3808
- 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
3809
- 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
3810
- 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
3811
- 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
3812
- 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
3813
- 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
3814
- 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
3815
- 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
3816
- 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
3817
- 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
3818
- 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
3819
- 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
3820
- };
3821
-
3822
- static const uint32_t iq3s_grid[512] = {
3823
- 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
3824
- 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
3825
- 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
3826
- 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
3827
- 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
3828
- 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
3829
- 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
3830
- 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
3831
- 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
3832
- 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
3833
- 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
3834
- 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
3835
- 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
3836
- 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
3837
- 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
3838
- 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
3839
- 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
3840
- 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
3841
- 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
3842
- 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
3843
- 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
3844
- 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
3845
- 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
3846
- 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
3847
- 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
3848
- 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
3849
- 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
3850
- 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
3851
- 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
3852
- 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
3853
- 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
3854
- 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
3855
- 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
3856
- 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
3857
- 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
3858
- 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
3859
- 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
3860
- 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
3861
- 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
3862
- 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
3863
- 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
3864
- 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
3865
- 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
3866
- 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
3867
- 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
3868
- 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
3869
- 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
3870
- 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
3871
- 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
3872
- 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
3873
- 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
3874
- 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
3875
- 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
3876
- 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
3877
- 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
3878
- 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
3879
- 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
3880
- 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
3881
- 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
3882
- 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
3883
- 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
3884
- 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
3885
- 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
3886
- 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
3887
- };
3888
-
3889
- #define NGRID_IQ2XXS 512
3890
- static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
3891
- 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
3892
- 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
3893
- 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
3894
- 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
3895
- 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
3896
- 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
3897
- 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
3898
- 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
3899
- 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
3900
- 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
3901
- 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
3902
- 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
3903
- 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
3904
- 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
3905
- 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
3906
- 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
3907
- 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
3908
- 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
3909
- 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
3910
- 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
3911
- 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
3912
- 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
3913
- 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
3914
- 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
3915
- 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
3916
- 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
3917
- 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
3918
- 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
3919
- 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
3920
- 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
3921
- 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
3922
- 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
3923
- 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
3924
- 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
3925
- 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
3926
- 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
3927
- 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
3928
- 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
3929
- 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
3930
- 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
3931
- 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
3932
- 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
3933
- 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
3934
- 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
3935
- 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
3936
- 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
3937
- 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
3938
- 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
3939
- 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
3940
- 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
3941
- 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
3942
- 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
3943
- 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
3944
- 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
3945
- 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
3946
- 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
3947
- 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
3948
- 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
3949
- 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
3950
- 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
3951
- 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
3952
- 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
3953
- 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
3954
- 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
3955
- 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
3956
- 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
3957
- 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
3958
- 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
3959
- 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
3960
- 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
3961
- 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
3962
- 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
3963
- 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
3964
- 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
3965
- 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
3966
- 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
3967
- 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
3968
- 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
3969
- 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
3970
- 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
3971
- 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
3972
- 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
3973
- 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
3974
- 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
3975
- 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
3976
- 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
3977
- 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
3978
- 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
3979
- 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
3980
- 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
3981
- 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
3982
- 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
3983
- 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
3984
- 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
3985
- 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
3986
- 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
3987
- 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
3988
- 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
3989
- 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
3990
- 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
3991
- 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
3992
- 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
3993
- 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
3994
- 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
3995
- 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
3996
- 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
3997
- 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
3998
- 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
3999
- 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
4000
- 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
4001
- 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
4002
- 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
4003
- 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
4004
- 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
4005
- 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
4006
- 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
4007
- 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
4008
- 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
4009
- 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
4010
- 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
4011
- 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
4012
- 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
4013
- 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
4014
- 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
4015
- 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
4016
- 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
4017
- 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
4018
- 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
4019
-
4020
- };
4021
-
4022
- static const uint8_t ksigns_iq2xs[128] = {
4023
- 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
4024
- 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
4025
- 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
4026
- 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
4027
- 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
4028
- 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
4029
- 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
4030
- 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
4031
- };
3281
+ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3282
+ (void)quant_weights; // not used
3283
+ const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
3284
+ quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
3285
+ return nrow * row_size;
3286
+ }
4032
3287
 
4033
- static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
3288
+ // ====================== "True" 2-bit (de)-quantization
4034
3289
 
4035
3290
  void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
4036
3291
  assert(k % QK_K == 0);
@@ -4198,39 +3453,23 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
4198
3453
  assert(k % QK_K == 0);
4199
3454
  const int nb = k / QK_K;
4200
3455
 
4201
- float db[4];
4202
- uint16_t idx[4];
4203
- //const int8_t * grid[4];
4204
-
4205
3456
  for (int i = 0; i < nb; i++) {
4206
3457
 
4207
3458
  const float d = GGML_FP16_TO_FP32(x[i].d);
4208
- const uint8_t * sc = x[i].scales;
4209
- const uint8_t * qs = x[i].qs;
3459
+ const uint8_t * qs = x[i].qs;
3460
+ const uint16_t * qh = x[i].qh;
4210
3461
 
4211
- for (int i8 = 0; i8 < QK_K/8; i8 += 4) {
4212
- idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
4213
- idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
4214
- idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
4215
- idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
4216
- //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
4217
- //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
4218
- //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
4219
- //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
4220
- db[0] = d * (2*(sc[0] & 7) + 1);
4221
- db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
4222
- db[2] = d * (2*(sc[1] & 7) + 1);
4223
- db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
3462
+ for (int ib = 0; ib < QK_K/32; ++ib) {
3463
+ const float dl = d * (2*((qh[ib] >> 12) & 7) + 1);
3464
+ const float delta = qh[ib] & 0x8000 ? -IQ1S_DELTA : IQ1S_DELTA;
4224
3465
  for (int l = 0; l < 4; ++l) {
4225
- const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
3466
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
4226
3467
  for (int j = 0; j < 8; ++j) {
4227
- //y[j] = db[l] * grid[l][j];
4228
- y[j] = db[l] * grid[j];
3468
+ y[j] = dl * (grid[j] + delta);
4229
3469
  }
4230
3470
  y += 8;
4231
3471
  }
4232
3472
  qs += 4;
4233
- sc += 2;
4234
3473
  }
4235
3474
  }
4236
3475
  }
@@ -4784,10 +4023,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4784
4023
  const block_q8_1 * restrict b_y0 = &vy0[i];
4785
4024
  const block_q8_1 * restrict b_y1 = &vy1[i];
4786
4025
 
4787
- float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
4788
- GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
4789
- GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
4790
- GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
4026
+ float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
4027
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
4028
+ GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
4029
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
4791
4030
  summs0 += summs_t;
4792
4031
 
4793
4032
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
@@ -4808,10 +4047,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4808
4047
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
4809
4048
 
4810
4049
  // mmla into int32x4_t
4811
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
4812
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
4813
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
4814
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
4050
+ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
4051
+ GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
4052
+ GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
4053
+ GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
4815
4054
 
4816
4055
  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
4817
4056
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -4852,7 +4091,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4852
4091
  const block_q8_1 * restrict y0 = &y[i + 0];
4853
4092
  const block_q8_1 * restrict y1 = &y[i + 1];
4854
4093
 
4855
- summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
4094
+ summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
4856
4095
 
4857
4096
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
4858
4097
 
@@ -4875,8 +4114,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4875
4114
  const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
4876
4115
  const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
4877
4116
 
4878
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
4879
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
4117
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
4118
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
4880
4119
  }
4881
4120
 
4882
4121
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@@ -4889,9 +4128,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4889
4128
  // Main loop
4890
4129
  for (int i = 0; i < nb; ++i) {
4891
4130
  const float d0 = GGML_FP16_TO_FP32(x[i].d);
4892
- const float d1 = y[i].d;
4131
+ const float d1 = GGML_FP16_TO_FP32(y[i].d);
4893
4132
 
4894
- summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
4133
+ summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
4895
4134
 
4896
4135
  const __m256 d0v = _mm256_set1_ps( d0 );
4897
4136
  const __m256 d1v = _mm256_set1_ps( d1 );
@@ -4943,7 +4182,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4943
4182
 
4944
4183
  int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
4945
4184
 
4946
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
4185
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
4947
4186
  }
4948
4187
 
4949
4188
  *s = sumf;
@@ -4961,7 +4200,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4961
4200
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
4962
4201
  }
4963
4202
 
4964
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
4203
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
4965
4204
  }
4966
4205
 
4967
4206
  *s = sumf;
@@ -5297,8 +4536,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5297
4536
 
5298
4537
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
5299
4538
 
5300
- summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
5301
- summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
4539
+ summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
4540
+ summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
5302
4541
 
5303
4542
  // extract the 5th bit via lookup table ((b) << 4)
5304
4543
  memcpy(&qh0, x0->qh, sizeof(qh0));
@@ -5342,10 +4581,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5342
4581
 
5343
4582
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
5344
4583
  ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
5345
- ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
4584
+ ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
5346
4585
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
5347
4586
  ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
5348
- ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
4587
+ ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
5349
4588
  }
5350
4589
 
5351
4590
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@@ -5362,7 +4601,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5362
4601
  const block_q5_1 * restrict x0 = &x[i];
5363
4602
  const block_q8_1 * restrict y0 = &y[i];
5364
4603
 
5365
- summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
4604
+ summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
5366
4605
 
5367
4606
  const v128_t m4b = wasm_i8x16_splat(0x0F);
5368
4607
 
@@ -5409,7 +4648,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5409
4648
  wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
5410
4649
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
5411
4650
  wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
5412
- wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
4651
+ wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
5413
4652
  }
5414
4653
 
5415
4654
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -5424,14 +4663,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5424
4663
  for (int i = 0; i < nb; i++) {
5425
4664
  const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
5426
4665
 
5427
- summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
4666
+ summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
5428
4667
 
5429
4668
  __m256i qx = bytes_from_nibbles_32(x[i].qs);
5430
4669
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
5431
4670
  bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
5432
4671
  qx = _mm256_or_si256(qx, bxhi);
5433
4672
 
5434
- const __m256 dy = _mm256_set1_ps(y[i].d);
4673
+ const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
5435
4674
  const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
5436
4675
 
5437
4676
  const __m256 q = mul_sum_us8_pairs_float(qx, qy);
@@ -5451,7 +4690,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5451
4690
  for (int i = 0; i < nb; i++) {
5452
4691
  const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
5453
4692
 
5454
- summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
4693
+ summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
5455
4694
 
5456
4695
  __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
5457
4696
  const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -5465,7 +4704,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5465
4704
  bxh = _mm_or_si128(bxh, bxhih);
5466
4705
  bx_0 = MM256_SET_M128I(bxh, bxl);
5467
4706
 
5468
- const __m256 dy = _mm256_set1_ps(y[i].d);
4707
+ const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
5469
4708
  const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
5470
4709
 
5471
4710
  const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
@@ -5532,7 +4771,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5532
4771
 
5533
4772
  int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
5534
4773
 
5535
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
4774
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
5536
4775
  }
5537
4776
 
5538
4777
  *s = sumf;
@@ -5556,7 +4795,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
5556
4795
  sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
5557
4796
  }
5558
4797
 
5559
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
4798
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
5560
4799
  }
5561
4800
 
5562
4801
  *s = sumf;
@@ -9758,8 +8997,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9758
8997
 
9759
8998
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
9760
8999
 
9761
- const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
9762
- const uint8x16_t mask2 = vld1q_u8(k_mask2);
9000
+ const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
9001
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
9763
9002
  const uint8x16_t m1 = vdupq_n_u8(1);
9764
9003
  const int32x4_t vzero = vdupq_n_s32(0);
9765
9004
 
@@ -9790,7 +9029,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9790
9029
  vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
9791
9030
  qs += 8;
9792
9031
 
9793
- vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
9032
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
9794
9033
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9795
9034
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9796
9035
  vs.val[0] = vceqq_u8(vs.val[0], mask2);
@@ -9799,7 +9038,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9799
9038
  q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
9800
9039
  q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
9801
9040
 
9802
- vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
9041
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
9803
9042
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
9804
9043
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
9805
9044
  vs.val[0] = vceqq_u8(vs.val[0], mask2);
@@ -9870,12 +9109,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9870
9109
  iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
9871
9110
  qs += 8;
9872
9111
 
9873
- __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
9112
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
9874
9113
  aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9875
9114
  const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
9876
9115
  const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
9877
9116
 
9878
- aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
9117
+ aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
9879
9118
  aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
9880
9119
  const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
9881
9120
  const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
@@ -10075,7 +9314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10075
9314
  #endif
10076
9315
  }
10077
9316
 
10078
- void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9317
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10079
9318
  assert(n % QK_K == 0);
10080
9319
  assert(nrc == 1);
10081
9320
  UNUSED(nrc);
@@ -10103,11 +9342,12 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10103
9342
 
10104
9343
  static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
10105
9344
 
10106
- const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
10107
- const uint8x16_t mask2 = vld1q_u8(k_mask2);
10108
- const int16x8_t hshift = vld1q_s16(k_shift);
10109
- const uint16x8_t m256 = vdupq_n_u16(256);
10110
- const uint8x16_t m1 = vdupq_n_u8(1);
9345
+ const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
9346
+ const uint8x16_t mask2 = vld1q_u8(k_mask2);
9347
+
9348
+ const int16x8_t hshift = vld1q_s16(k_shift);
9349
+ const uint16x8_t m256 = vdupq_n_u16(256);
9350
+ const uint8x16_t m1 = vdupq_n_u8(1);
10111
9351
 
10112
9352
  uint8x16x2_t vs;
10113
9353
  ggml_int8x16x4_t q3s;
@@ -10139,18 +9379,18 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10139
9379
 
10140
9380
  const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
10141
9381
  idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
10142
- const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
10143
- iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
10144
- const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
10145
- iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
9382
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
9383
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
9384
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
9385
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
10146
9386
  idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
10147
- const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
10148
- iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
10149
- const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
10150
- iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
9387
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
9388
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
9389
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
9390
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
10151
9391
 
10152
9392
 
10153
- vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
9393
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
10154
9394
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10155
9395
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10156
9396
  vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
@@ -10159,7 +9399,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
10159
9399
  q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
10160
9400
  q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
10161
9401
 
10162
- vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
9402
+ vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
10163
9403
  vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
10164
9404
  vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
10165
9405
  vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
@@ -10322,7 +9562,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
10322
9562
  }
10323
9563
  #endif
10324
9564
 
10325
- void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9565
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
10326
9566
  assert(n % QK_K == 0);
10327
9567
  assert(nrc == 1);
10328
9568
  UNUSED(nrc);
@@ -10335,155 +9575,119 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
10335
9575
 
10336
9576
  const int nb = n / QK_K;
10337
9577
 
10338
- // TODO: implement for QK_K = 64
10339
- #if defined __ARM_NEON && QK_K == 256
10340
-
10341
- const uint8x16_t m8 = vdupq_n_u8(0x08);
10342
- const uint8x16_t m7 = vdupq_n_u8(0x07);
10343
- const uint8x16_t m1 = vdupq_n_u8(0x01);
10344
- const int32x4_t vzero = vdupq_n_s32(0);
9578
+ #if defined __ARM_NEON
10345
9579
 
10346
- uint16_t gindex[8];
10347
- uint16x8x2_t vindex;
10348
- int8x16x4_t q1b;
9580
+ ggml_int8x16x4_t q1b;
10349
9581
  ggml_int8x16x4_t q8b;
10350
- uint16x8x4_t scales;
10351
- int32x4x2_t sumi;
10352
- int32x4x2_t dotq;
10353
9582
 
10354
9583
  float sumf = 0;
10355
9584
  for (int i = 0; i < nb; ++i) {
10356
9585
 
10357
- const int8_t * q8 = y[i].qs;
10358
- const uint8_t * qs = x[i].qs;
10359
- const uint8_t * sc = x[i].scales;
9586
+ const int8_t * q8 = y[i].qs;
9587
+ const uint8_t * qs = x[i].qs;
9588
+ const uint16_t * qh = x[i].qh;
9589
+
9590
+ int sumi1 = 0, sumi2 = 0, sumi3 = 0;
10360
9591
 
10361
- sumi.val[0] = sumi.val[1] = vzero;
9592
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
10362
9593
 
10363
- for (int i128 = 0; i128 < QK_K/128; ++i128) {
10364
- const uint8x16_t ql = vld1q_u8(qs); qs += 16;
10365
- const uint8x8_t tm1 = vld1_u8 (sc); sc += 8;
10366
- const uint8x8_t tm2 = vshr_n_u8(tm1, 4);
10367
- const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
10368
- const uint8x16_t hbit = vandq_u8(qh, m8);
10369
- vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
10370
- vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
10371
- const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
10372
- scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
10373
- scales.val[1] = vmovl_u8(vget_high_u8 (scales8));
9594
+ q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))),
9595
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700)))));
9596
+ q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))),
9597
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700)))));
9598
+ q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))),
9599
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700)))));
9600
+ q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))),
9601
+ vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700)))));
9602
+ qs += 8;
10374
9603
 
10375
- for (int l = 0; l < 2; ++l) {
10376
- vst1q_u16(gindex+0, vindex.val[l]);
10377
- q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1])));
10378
- q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3])));
10379
- q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5])));
10380
- q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7])));
10381
- q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
9604
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
10382
9605
 
10383
- dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1]));
10384
- dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3]));
9606
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]);
9607
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]);
9608
+
9609
+ const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
9610
+ const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
9611
+ sumi1 += vaddvq_s32(p1) * ls1;
9612
+ sumi2 += vaddvq_s32(p2) * ls2;
9613
+ sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1)
9614
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1);
10385
9615
 
10386
- sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
10387
- sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
10388
- }
10389
9616
  }
10390
9617
 
10391
- sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1]));
9618
+ sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
10392
9619
  }
10393
9620
 
10394
9621
  *s = sumf;
10395
9622
 
10396
- // TODO: implement for QK_K = 64
10397
- #elif defined __AVX2__ && QK_K == 256
10398
-
10399
- const __m128i m8 = _mm_set1_epi8(0x08);
10400
- const __m128i m7 = _mm_set1_epi8(0x07);
10401
- const __m128i m1 = _mm_set1_epi8(0x01);
10402
- const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
10403
- const __m128i shuffle_s[4] = {
10404
- _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
10405
- _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
10406
- _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
10407
- _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
10408
- };
10409
-
10410
- uint64_t aux64;
10411
-
10412
- typedef union m256i_uint16 {
10413
- __m256i reg;
10414
- uint16_t s[16];
10415
- } m256i_uint16_t;
10416
-
10417
- m256i_uint16_t v_gindex;
9623
+ #elif defined __AVX2__
10418
9624
 
10419
9625
  __m256 accum = _mm256_setzero_ps();
9626
+ float accum1 = 0;
10420
9627
  for (int i = 0; i < nb; ++i) {
10421
9628
 
10422
- const int8_t * q8 = y[i].qs;
10423
- const uint8_t * qs = x[i].qs;
10424
- const uint8_t * sc = x[i].scales;
9629
+ const int8_t * q8 = y[i].qs;
9630
+ const uint8_t * qs = x[i].qs;
9631
+ const uint16_t * qh = x[i].qh;
10425
9632
 
10426
9633
  __m256i sumi = _mm256_setzero_si256();
10427
- for (int i128 = 0; i128 < QK_K/128; ++i128) {
10428
- const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
10429
- memcpy(&aux64, sc, 8); sc += 8;
10430
- const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
10431
- const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
10432
- v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
10433
- const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
9634
+ int sumi1 = 0;
9635
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
9636
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
9637
+ iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
9638
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
9639
+ iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
9640
+ qs += 8;
9641
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
9642
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10434
9643
 
10435
- for (int i32 = 0; i32 < 4; ++i32) {
10436
- const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
10437
- const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
10438
- iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
10439
- const __m256i dot = mul_add_epi8(q1b, q8b);
10440
- const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
10441
- const __m256i p = _mm256_madd_epi16(s16, dot);
10442
- sumi = _mm256_add_epi32(sumi, p);
10443
- }
9644
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
9645
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
9646
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
9647
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
9648
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
9649
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
10444
9650
 
9651
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
9652
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
9653
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
10445
9654
  }
10446
9655
 
10447
- accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum);
9656
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
9657
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
9658
+ accum1 += d * sumi1;
10448
9659
 
10449
9660
  }
10450
9661
 
10451
- *s = hsum_float_8(accum);
9662
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
10452
9663
 
10453
9664
  #else
10454
9665
 
10455
- int db[4];
10456
- uint16_t idx[4];
10457
-
10458
9666
  float sumf = 0;
10459
- for (int i = 0; i < nb; ++i) {
9667
+ for (int i = 0; i < nb; i++) {
10460
9668
 
10461
- const int8_t * q8 = y[i].qs;
10462
- const uint8_t * qs = x[i].qs;
10463
- const uint8_t * sc = x[i].scales;
9669
+ const int8_t * q8 = y[i].qs;
9670
+ const uint8_t * qs = x[i].qs;
9671
+ const uint16_t * qh = x[i].qh;
10464
9672
 
10465
- int sumi = 0;
10466
- for (int i32 = 0; i32 < QK_K/32; ++i32) {
10467
- idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
10468
- idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
10469
- idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
10470
- idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
10471
- db[0] = (2*(sc[0] & 7) + 1);
10472
- db[1] = (2*((sc[0] >> 4) & 7) + 1);
10473
- db[2] = (2*(sc[1] & 7) + 1);
10474
- db[3] = (2*((sc[1] >> 4) & 7) + 1);
9673
+ int sumi = 0, sumi1 = 0;
9674
+ for (int ib = 0; ib < QK_K/32; ++ib) {
9675
+ const int ls = 2*((qh[ib] >> 12) & 7) + 1;
9676
+ const int delta = qh[ib] & 0x8000 ? -1 : 1;
9677
+ int lsum = 0;
10475
9678
  for (int l = 0; l < 4; ++l) {
10476
- const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
10477
- int suml = 0;
10478
- for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j];
10479
- sumi += db[l] * suml;
9679
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
9680
+ for (int j = 0; j < 8; ++j) {
9681
+ lsum += q8[j] * grid[j];
9682
+ }
10480
9683
  q8 += 8;
10481
9684
  }
9685
+ sumi += ls * lsum;
9686
+ sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
10482
9687
  qs += 4;
10483
- sc += 2;
10484
9688
  }
10485
9689
 
10486
- sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
9690
+ sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
10487
9691
  }
10488
9692
 
10489
9693
  *s = sumf;
@@ -10744,7 +9948,7 @@ static inline int iq2_grid_size(enum ggml_type type) {
10744
9948
  GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
10745
9949
  return type == GGML_TYPE_IQ2_XXS ? 256 :
10746
9950
  type == GGML_TYPE_IQ2_XS ? 512 :
10747
- type == GGML_TYPE_IQ1_S ? 512 : 1024;
9951
+ type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024;
10748
9952
  }
10749
9953
 
10750
9954
  static int iq2_compare_func(const void * left, const void * right) {
@@ -10811,39 +10015,135 @@ void iq2xs_init_impl(enum ggml_type type) {
10811
10015
  40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
10812
10016
  42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
10813
10017
  };
10814
- static const uint16_t kgrid_1bit_512[512] = {
10815
- 10, 33, 41, 85, 132, 134, 160, 162, 277, 337, 340, 345, 357, 405, 516, 545,
10816
- 553, 598, 641, 650, 681, 1042, 1044, 1097, 1169, 1176, 1320, 1345, 1365, 1378, 1434, 1444,
10817
- 1545, 1617, 1642, 1685, 2053, 2080, 2089, 2133, 2176, 2182, 2208, 2214, 2306, 2384, 2393, 2440,
10818
- 2453, 2581, 2664, 2690, 2721, 4117, 4161, 4182, 4184, 4261, 4357, 4369, 4372, 4377, 4390, 4422,
10819
- 4432, 4437, 4449, 4457, 4485, 4497, 4505, 4629, 4677, 4696, 4774, 5205, 5217, 5225, 5386, 5397,
10820
- 5409, 5445, 5457, 5460, 5461, 5462, 5465, 5472, 5477, 5525, 5545, 5650, 5668, 5717, 5729, 5769,
10821
- 5777, 6212, 6234, 6244, 6293, 6424, 6482, 6485, 6502, 6505, 6529, 6538, 6565, 6656, 6682, 6788,
10822
- 6806, 6820, 8218, 8224, 8226, 8232, 8277, 8326, 8354, 8469, 8521, 8530, 8549, 8596, 8737, 8794,
10823
- 9221, 9253, 9348, 9369, 9380, 9474, 9557, 9633, 9732, 9753, 9793, 9830, 9862, 9880, 10240, 10272,
10824
- 10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665,
10825
- 16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685,
10826
- 17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529,
10827
- 18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517,
10828
- 20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872,
10829
- 20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653,
10830
- 21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842,
10831
- 21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913,
10832
- 21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608,
10833
- 22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072,
10834
- 23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110,
10835
- 25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937,
10836
- 25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885,
10837
- 26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808,
10838
- 32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320,
10839
- 33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918,
10840
- 34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125,
10841
- 37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973,
10842
- 38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485,
10843
- 38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497,
10844
- 39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514,
10845
- 41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
10846
- 42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
10018
+ static const uint16_t kgrid_1bit_2048[NGRID_IQ1S] = {
10019
+ 0, 2, 5, 8, 10, 17, 21, 32, 34, 40, 42, 69, 81, 84, 86, 101,
10020
+ 128, 130, 136, 138, 149, 160, 162, 168, 170, 260, 261, 273, 276, 278, 281, 282,
10021
+ 293, 321, 326, 329, 338, 341, 346, 353, 356, 358, 360, 389, 401, 404, 406, 421,
10022
+ 512, 514, 520, 522, 533, 544, 546, 552, 554, 581, 593, 601, 612, 617, 640, 642,
10023
+ 648, 650, 657, 661, 665, 672, 674, 680, 682, 1041, 1044, 1046, 1061, 1089, 1097, 1109,
10024
+ 1114, 1124, 1125, 1169, 1177, 1189, 1281, 1284, 1285, 1286, 1301, 1304, 1306, 1321, 1344, 1349,
10025
+ 1354, 1360, 1361, 1364, 1365, 1366, 1369, 1376, 1378, 1381, 1384, 1386, 1409, 1425, 1429, 1432,
10026
+ 1434, 1441, 1444, 1445, 1446, 1449, 1556, 1561, 1601, 1604, 1616, 1618, 1621, 1624, 1632, 1633,
10027
+ 1638, 1641, 1669, 1681, 1684, 1689, 2048, 2050, 2056, 2058, 2069, 2080, 2082, 2088, 2090, 2117,
10028
+ 2129, 2134, 2149, 2176, 2178, 2184, 2186, 2197, 2208, 2210, 2216, 2218, 2309, 2321, 2324, 2329,
10029
+ 2340, 2341, 2369, 2384, 2385, 2389, 2401, 2404, 2409, 2449, 2452, 2454, 2457, 2469, 2560, 2562,
10030
+ 2568, 2570, 2581, 2592, 2594, 2600, 2602, 2629, 2641, 2649, 2657, 2661, 2688, 2690, 2693, 2696,
10031
+ 2698, 2709, 2720, 2722, 2728, 2730, 4112, 4113, 4116, 4121, 4132, 4133, 4161, 4164, 4176, 4181,
10032
+ 4184, 4193, 4196, 4197, 4201, 4241, 4244, 4246, 4257, 4261, 4353, 4356, 4358, 4361, 4368, 4370,
10033
+ 4373, 4376, 4385, 4388, 4393, 4421, 4426, 4432, 4433, 4434, 4436, 4437, 4438, 4441, 4448, 4453,
10034
+ 4484, 4498, 4501, 4513, 4516, 4625, 4628, 4630, 4645, 4672, 4678, 4681, 4690, 4693, 4696, 4698,
10035
+ 4708, 4710, 4741, 4753, 4756, 4758, 4773, 5121, 5126, 5129, 5140, 5141, 5144, 5145, 5153, 5158,
10036
+ 5185, 5189, 5190, 5192, 5194, 5201, 5204, 5205, 5206, 5209, 5218, 5221, 5224, 5252, 5257, 5264,
10037
+ 5268, 5269, 5272, 5273, 5274, 5281, 5284, 5285, 5289, 5378, 5381, 5386, 5393, 5396, 5397, 5398,
10038
+ 5401, 5408, 5410, 5413, 5416, 5418, 5441, 5444, 5445, 5446, 5457, 5458, 5460, 5461, 5462, 5465,
10039
+ 5466, 5473, 5476, 5477, 5478, 5481, 5504, 5506, 5508, 5509, 5512, 5514, 5520, 5521, 5524, 5525,
10040
+ 5526, 5529, 5530, 5536, 5538, 5541, 5633, 5636, 5637, 5638, 5653, 5654, 5656, 5658, 5665, 5670,
10041
+ 5696, 5698, 5700, 5701, 5704, 5706, 5713, 5717, 5718, 5720, 5721, 5729, 5732, 5733, 5736, 5737,
10042
+ 5738, 5766, 5770, 5778, 5781, 5796, 5801, 6161, 6166, 6181, 6209, 6212, 6214, 6217, 6224, 6229,
10043
+ 6232, 6234, 6240, 6241, 6244, 6246, 6249, 6277, 6289, 6292, 6309, 6416, 6418, 6421, 6426, 6433,
10044
+ 6437, 6466, 6468, 6469, 6472, 6481, 6484, 6485, 6486, 6489, 6490, 6496, 6501, 6506, 6537, 6545,
10045
+ 6546, 6549, 6552, 6561, 6566, 6569, 6665, 6678, 6692, 6694, 6724, 6726, 6729, 6736, 6738, 6741,
10046
+ 6744, 6753, 6758, 6761, 6789, 6801, 6806, 6810, 8192, 8194, 8200, 8202, 8213, 8224, 8226, 8229,
10047
+ 8232, 8234, 8261, 8273, 8281, 8289, 8293, 8320, 8322, 8328, 8330, 8341, 8352, 8354, 8357, 8360,
10048
+ 8362, 8453, 8465, 8468, 8473, 8485, 8514, 8516, 8521, 8533, 8536, 8538, 8545, 8548, 8549, 8550,
10049
+ 8581, 8592, 8598, 8601, 8613, 8705, 8712, 8714, 8721, 8725, 8736, 8738, 8744, 8746, 8773, 8785,
10050
+ 8790, 8793, 8805, 8833, 8840, 8842, 8849, 8853, 8864, 8866, 8872, 8874, 9221, 9236, 9238, 9241,
10051
+ 9253, 9284, 9285, 9286, 9289, 9298, 9301, 9304, 9306, 9318, 9349, 9361, 9364, 9369, 9377, 9381,
10052
+ 9481, 9493, 9505, 9513, 9536, 9541, 9544, 9553, 9556, 9557, 9561, 9570, 9573, 9576, 9609, 9616,
10053
+ 9620, 9621, 9624, 9626, 9633, 9636, 9638, 9641, 9733, 9744, 9746, 9753, 9765, 9793, 9801, 9813,
10054
+ 9824, 9825, 9833, 9860, 9862, 9872, 9882, 10240, 10242, 10248, 10250, 10261, 10272, 10274, 10280, 10282,
10055
+ 10309, 10321, 10324, 10341, 10368, 10370, 10376, 10378, 10400, 10402, 10408, 10410, 10505, 10513, 10516, 10521,
10056
+ 10533, 10566, 10569, 10578, 10581, 10593, 10596, 10598, 10601, 10629, 10640, 10646, 10649, 10660, 10661, 10752,
10057
+ 10754, 10760, 10762, 10784, 10786, 10792, 10794, 10821, 10833, 10838, 10841, 10853, 10880, 10882, 10888, 10890,
10058
+ 10901, 10912, 10914, 10920, 10922, 16389, 16401, 16406, 16421, 16457, 16466, 16469, 16472, 16474, 16481, 16484,
10059
+ 16486, 16532, 16537, 16545, 16550, 16640, 16641, 16644, 16646, 16649, 16658, 16661, 16662, 16664, 16666, 16673,
10060
+ 16678, 16681, 16709, 16712, 16714, 16721, 16724, 16725, 16726, 16729, 16730, 16741, 16744, 16746, 16769, 16772,
10061
+ 16774, 16784, 16786, 16789, 16800, 16801, 16802, 16901, 16913, 16916, 16918, 16933, 16961, 16978, 16981, 16986,
10062
+ 16996, 17001, 17033, 17044, 17061, 17409, 17429, 17433, 17449, 17477, 17480, 17482, 17489, 17492, 17493, 17494,
10063
+ 17505, 17506, 17509, 17512, 17514, 17537, 17542, 17545, 17552, 17554, 17557, 17568, 17569, 17577, 17665, 17666,
10064
+ 17669, 17674, 17681, 17684, 17685, 17686, 17689, 17696, 17701, 17706, 17729, 17732, 17733, 17734, 17737, 17744,
10065
+ 17745, 17748, 17749, 17750, 17752, 17753, 17761, 17764, 17765, 17766, 17769, 17794, 17796, 17797, 17800, 17809,
10066
+ 17812, 17813, 17814, 17817, 17818, 17829, 17832, 17834, 17921, 17925, 17929, 17940, 17941, 17944, 17946, 17953,
10067
+ 17956, 17961, 17984, 17986, 17989, 17992, 18000, 18001, 18002, 18005, 18006, 18009, 18018, 18021, 18024, 18049,
10068
+ 18053, 18058, 18068, 18069, 18081, 18084, 18086, 18437, 18449, 18453, 18458, 18469, 18498, 18505, 18512, 18517,
10069
+ 18520, 18529, 18532, 18534, 18537, 18565, 18577, 18580, 18582, 18585, 18597, 18689, 18693, 18694, 18698, 18704,
10070
+ 18708, 18709, 18712, 18721, 18724, 18726, 18752, 18757, 18762, 18769, 18770, 18772, 18773, 18774, 18777, 18784,
10071
+ 18786, 18789, 18790, 18794, 18822, 18825, 18834, 18837, 18838, 18840, 18849, 18852, 18854, 18857, 18966, 19012,
10072
+ 19014, 19017, 19029, 19032, 19034, 19044, 19049, 19092, 19109, 20481, 20484, 20485, 20486, 20489, 20498, 20501,
10073
+ 20506, 20513, 20516, 20521, 20544, 20549, 20552, 20561, 20564, 20565, 20566, 20569, 20581, 20584, 20614, 20617,
10074
+ 20629, 20632, 20640, 20641, 20646, 20649, 20741, 20744, 20745, 20746, 20753, 20756, 20757, 20758, 20760, 20761,
10075
+ 20768, 20773, 20774, 20776, 20778, 20801, 20804, 20805, 20806, 20809, 20816, 20817, 20818, 20820, 20821, 20822,
10076
+ 20824, 20825, 20826, 20833, 20836, 20837, 20838, 20841, 20866, 20869, 20881, 20884, 20885, 20886, 20889, 20896,
10077
+ 20901, 20906, 20993, 20998, 21010, 21013, 21018, 21025, 21028, 21058, 21061, 21066, 21073, 21076, 21077, 21078,
10078
+ 21081, 21090, 21093, 21125, 21136, 21138, 21141, 21145, 21146, 21156, 21508, 21509, 21521, 21524, 21525, 21526,
10079
+ 21528, 21529, 21537, 21541, 21544, 21546, 21569, 21572, 21573, 21574, 21577, 21578, 21584, 21585, 21588, 21589,
10080
+ 21590, 21592, 21593, 21594, 21601, 21602, 21604, 21605, 21606, 21609, 21632, 21640, 21642, 21649, 21652, 21653,
10081
+ 21654, 21657, 21665, 21668, 21669, 21674, 21761, 21762, 21764, 21765, 21766, 21769, 21776, 21777, 21778, 21780,
10082
+ 21781, 21782, 21785, 21786, 21793, 21796, 21797, 21798, 21801, 21824, 21825, 21826, 21828, 21829, 21830, 21832,
10083
+ 21833, 21840, 21841, 21842, 21844, 21845, 21846, 21848, 21849, 21850, 21856, 21857, 21860, 21861, 21862, 21864,
10084
+ 21865, 21866, 21889, 21892, 21893, 21897, 21898, 21904, 21905, 21908, 21909, 21910, 21912, 21913, 21921, 21924,
10085
+ 21925, 21926, 21929, 22016, 22017, 22018, 22020, 22022, 22024, 22025, 22033, 22036, 22037, 22040, 22041, 22048,
10086
+ 22049, 22050, 22052, 22053, 22054, 22056, 22057, 22081, 22085, 22086, 22088, 22089, 22090, 22096, 22097, 22098,
10087
+ 22100, 22101, 22102, 22104, 22105, 22106, 22113, 22116, 22117, 22121, 22146, 22149, 22150, 22152, 22153, 22154,
10088
+ 22161, 22165, 22170, 22178, 22181, 22182, 22184, 22185, 22532, 22533, 22534, 22537, 22544, 22549, 22552, 22561,
10089
+ 22570, 22597, 22600, 22602, 22609, 22612, 22613, 22614, 22616, 22617, 22624, 22626, 22628, 22629, 22658, 22665,
10090
+ 22672, 22674, 22677, 22680, 22689, 22697, 22785, 22786, 22789, 22794, 22801, 22804, 22805, 22806, 22809, 22821,
10091
+ 22849, 22852, 22853, 22854, 22857, 22864, 22865, 22866, 22868, 22869, 22870, 22872, 22873, 22874, 22881, 22884,
10092
+ 22885, 22886, 22889, 22913, 22917, 22921, 22929, 22932, 22933, 22934, 22936, 22937, 22949, 23044, 23048, 23061,
10093
+ 23066, 23072, 23077, 23078, 23081, 23109, 23112, 23113, 23121, 23125, 23126, 23128, 23129, 23138, 23141, 23144,
10094
+ 23146, 23169, 23178, 23186, 23189, 23190, 23192, 23194, 23201, 24581, 24596, 24598, 24601, 24613, 24644, 24656,
10095
+ 24661, 24662, 24664, 24666, 24673, 24676, 24678, 24681, 24705, 24726, 24741, 24833, 24836, 24838, 24841, 24850,
10096
+ 24853, 24865, 24866, 24870, 24873, 24901, 24905, 24913, 24917, 24918, 24921, 24933, 24934, 24938, 24964, 24970,
10097
+ 24978, 24981, 24993, 24998, 25001, 25105, 25110, 25113, 25152, 25153, 25158, 25173, 25174, 25176, 25184, 25221,
10098
+ 25233, 25238, 25253, 25617, 25618, 25621, 25622, 25626, 25633, 25638, 25641, 25664, 25666, 25669, 25672, 25674,
10099
+ 25681, 25684, 25685, 25686, 25689, 25690, 25696, 25698, 25701, 25732, 25733, 25737, 25744, 25746, 25748, 25749,
10100
+ 25750, 25752, 25754, 25761, 25764, 25769, 25861, 25864, 25866, 25873, 25877, 25878, 25881, 25924, 25925, 25926,
10101
+ 25929, 25936, 25937, 25940, 25941, 25942, 25945, 25953, 25956, 25957, 25958, 25961, 25990, 25993, 25994, 26001,
10102
+ 26005, 26006, 26009, 26010, 26018, 26021, 26022, 26024, 26114, 26121, 26133, 26144, 26150, 26152, 26153, 26176,
10103
+ 26181, 26184, 26186, 26193, 26196, 26197, 26198, 26200, 26202, 26208, 26213, 26216, 26240, 26242, 26245, 26250,
10104
+ 26260, 26262, 26264, 26265, 26272, 26276, 26278, 26282, 26646, 26649, 26661, 26689, 26706, 26709, 26714, 26721,
10105
+ 26729, 26757, 26769, 26776, 26790, 26881, 26884, 26896, 26901, 26913, 26916, 26918, 26921, 26944, 26945, 26949,
10106
+ 26950, 26952, 26961, 26964, 26965, 26966, 26969, 26976, 26981, 26986, 27010, 27012, 27018, 27029, 27041, 27044,
10107
+ 27045, 27049, 27153, 27158, 27160, 27201, 27204, 27209, 27216, 27221, 27224, 27226, 27236, 27237, 27241, 27270,
10108
+ 27284, 27288, 27290, 27302, 32768, 32770, 32776, 32778, 32800, 32802, 32808, 32810, 32837, 32848, 32849, 32852,
10109
+ 32854, 32857, 32869, 32896, 32898, 32904, 32906, 32917, 32928, 32930, 32936, 32938, 33029, 33041, 33044, 33046,
10110
+ 33049, 33061, 33089, 33092, 33097, 33104, 33106, 33109, 33110, 33112, 33113, 33124, 33126, 33129, 33157, 33161,
10111
+ 33172, 33174, 33177, 33189, 33280, 33282, 33288, 33290, 33301, 33312, 33314, 33320, 33322, 33361, 33364, 33369,
10112
+ 33381, 33408, 33410, 33416, 33418, 33429, 33440, 33442, 33448, 33450, 33812, 33817, 33857, 33860, 33873, 33877,
10113
+ 33882, 33889, 33892, 33897, 33940, 33945, 34049, 34057, 34066, 34069, 34074, 34086, 34089, 34112, 34113, 34117,
10114
+ 34120, 34129, 34132, 34133, 34134, 34137, 34138, 34149, 34150, 34152, 34154, 34177, 34180, 34182, 34185, 34192,
10115
+ 34194, 34197, 34200, 34214, 34321, 34326, 34329, 34341, 34369, 34372, 34377, 34378, 34384, 34389, 34393, 34394,
10116
+ 34401, 34406, 34410, 34437, 34449, 34458, 34468, 34816, 34818, 34824, 34826, 34837, 34848, 34850, 34856, 34858,
10117
+ 34881, 34885, 34897, 34900, 34905, 34917, 34921, 34944, 34946, 34952, 34954, 34965, 34976, 34978, 34984, 34986,
10118
+ 35077, 35078, 35089, 35092, 35094, 35109, 35137, 35140, 35142, 35145, 35152, 35154, 35157, 35162, 35169, 35172,
10119
+ 35205, 35222, 35225, 35237, 35328, 35330, 35336, 35338, 35349, 35360, 35362, 35368, 35370, 35397, 35409, 35412,
10120
+ 35414, 35456, 35458, 35464, 35466, 35477, 35488, 35490, 35496, 35498, 36869, 36881, 36886, 36888, 36889, 36901,
10121
+ 36929, 36934, 36937, 36949, 36952, 36954, 36969, 36970, 36997, 37009, 37012, 37014, 37017, 37029, 37121, 37124,
10122
+ 37126, 37129, 37136, 37141, 37144, 37146, 37153, 37156, 37158, 37161, 37184, 37189, 37200, 37201, 37204, 37205,
10123
+ 37206, 37209, 37218, 37221, 37252, 37254, 37266, 37269, 37272, 37281, 37284, 37286, 37289, 37381, 37393, 37396,
10124
+ 37401, 37413, 37444, 37446, 37449, 37456, 37458, 37461, 37464, 37478, 37481, 37509, 37524, 37526, 37545, 37889,
10125
+ 37892, 37894, 37904, 37909, 37912, 37926, 37952, 37962, 37969, 37972, 37973, 37974, 37976, 37977, 37984, 37985,
10126
+ 37986, 37989, 38020, 38022, 38034, 38036, 38037, 38040, 38049, 38057, 38144, 38149, 38152, 38154, 38160, 38161,
10127
+ 38164, 38165, 38166, 38169, 38177, 38181, 38185, 38186, 38209, 38212, 38213, 38214, 38217, 38224, 38225, 38226,
10128
+ 38228, 38229, 38230, 38232, 38233, 38234, 38241, 38244, 38245, 38246, 38249, 38273, 38277, 38280, 38289, 38290,
10129
+ 38292, 38293, 38294, 38297, 38298, 38304, 38306, 38309, 38312, 38314, 38401, 38404, 38416, 38421, 38425, 38432,
10130
+ 38438, 38441, 38469, 38472, 38473, 38481, 38482, 38485, 38486, 38489, 38501, 38504, 38530, 38532, 38537, 38538,
10131
+ 38546, 38548, 38549, 38564, 38566, 38569, 38917, 38934, 38937, 38949, 38977, 38982, 38992, 38994, 38997, 38998,
10132
+ 39002, 39012, 39013, 39045, 39057, 39062, 39065, 39077, 39172, 39174, 39177, 39184, 39186, 39189, 39192, 39194,
10133
+ 39200, 39201, 39204, 39206, 39232, 39234, 39237, 39240, 39242, 39249, 39252, 39253, 39254, 39257, 39266, 39269,
10134
+ 39270, 39274, 39297, 39300, 39312, 39314, 39317, 39322, 39329, 39334, 39429, 39445, 39461, 39492, 39494, 39497,
10135
+ 39504, 39509, 39512, 39521, 39557, 39569, 39572, 39573, 39574, 40960, 40962, 40968, 40970, 40981, 40992, 40994,
10136
+ 41000, 41002, 41029, 41041, 41044, 41046, 41049, 41088, 41090, 41096, 41098, 41109, 41120, 41122, 41128, 41130,
10137
+ 41221, 41225, 41233, 41236, 41238, 41241, 41242, 41286, 41289, 41297, 41301, 41304, 41306, 41313, 41316, 41349,
10138
+ 41360, 41362, 41366, 41369, 41474, 41480, 41482, 41488, 41497, 41506, 41512, 41514, 41541, 41553, 41558, 41561,
10139
+ 41573, 41600, 41602, 41608, 41610, 41621, 41632, 41634, 41640, 41642, 42009, 42021, 42049, 42052, 42064, 42068,
10140
+ 42069, 42072, 42074, 42081, 42085, 42086, 42088, 42089, 42117, 42246, 42249, 42256, 42258, 42261, 42264, 42278,
10141
+ 42281, 42306, 42309, 42321, 42324, 42325, 42326, 42329, 42341, 42346, 42369, 42372, 42373, 42374, 42377, 42386,
10142
+ 42389, 42392, 42501, 42513, 42518, 42522, 42529, 42533, 42564, 42566, 42570, 42578, 42581, 42582, 42584, 42592,
10143
+ 42594, 42630, 42640, 42645, 42646, 42649, 42657, 42660, 42662, 43008, 43010, 43016, 43018, 43040, 43042, 43048,
10144
+ 43050, 43089, 43092, 43094, 43097, 43136, 43138, 43144, 43146, 43157, 43168, 43170, 43176, 43178, 43269, 43284,
10145
+ 43289, 43297, 43301, 43329, 43344, 43349, 43354, 43361, 43366, 43369, 43408, 43414, 43520, 43522, 43528, 43530,
10146
+ 43552, 43554, 43560, 43562, 43601, 43604, 43606, 43648, 43650, 43656, 43658, 43669, 43680, 43682, 43688, 43690,
10847
10147
  };
10848
10148
  static const uint16_t kgrid_2bit_1024[1024] = {
10849
10149
  0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
@@ -10917,12 +10217,12 @@ void iq2xs_init_impl(enum ggml_type type) {
10917
10217
  const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
10918
10218
  const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
10919
10219
  type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
10920
- type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
10220
+ type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024;
10921
10221
  uint64_t * kgrid_q2xs;
10922
10222
  int * kmap_q2xs;
10923
10223
  uint16_t * kneighbors_q2xs;
10924
10224
 
10925
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10225
+ //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10926
10226
  uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
10927
10227
  for (int k = 0; k < grid_size; ++k) {
10928
10228
  int8_t * pos = (int8_t *)(the_grid + k);
@@ -10977,7 +10277,7 @@ void iq2xs_init_impl(enum ggml_type type) {
10977
10277
  }
10978
10278
  num_neighbors += n;
10979
10279
  }
10980
- printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10280
+ //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10981
10281
  kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
10982
10282
  iq2_data[gindex].neighbours = kneighbors_q2xs;
10983
10283
  int counter = 0;
@@ -11400,8 +10700,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
11400
10700
  }
11401
10701
  }
11402
10702
 
11403
- size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11404
- (void)hist;
10703
+ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11405
10704
  GGML_ASSERT(n_per_row%QK_K == 0);
11406
10705
  int nblock = n_per_row/QK_K;
11407
10706
  char * qrow = (char *)dst;
@@ -11413,8 +10712,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row,
11413
10712
  return nrow * nblock * sizeof(block_iq2_xxs);
11414
10713
  }
11415
10714
 
11416
- size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11417
- (void)hist;
10715
+ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11418
10716
  GGML_ASSERT(n_per_row%QK_K == 0);
11419
10717
  int nblock = n_per_row/QK_K;
11420
10718
  char * qrow = (char *)dst;
@@ -11518,7 +10816,7 @@ void iq3xs_init_impl(int grid_size) {
11518
10816
  int * kmap_q3xs;
11519
10817
  uint16_t * kneighbors_q3xs;
11520
10818
 
11521
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10819
+ //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
11522
10820
  uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
11523
10821
  for (int k = 0; k < grid_size; ++k) {
11524
10822
  int8_t * pos = (int8_t *)(the_grid + k);
@@ -11573,7 +10871,7 @@ void iq3xs_init_impl(int grid_size) {
11573
10871
  }
11574
10872
  num_neighbors += n;
11575
10873
  }
11576
- printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10874
+ //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
11577
10875
  kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
11578
10876
  iq3_data[gindex].neighbours = kneighbors_q3xs;
11579
10877
  int counter = 0;
@@ -11856,8 +11154,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11856
11154
  }
11857
11155
  }
11858
11156
 
11859
- size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11860
- (void)hist;
11157
+ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11861
11158
  GGML_ASSERT(n_per_row%QK_K == 0);
11862
11159
  int nblock = n_per_row/QK_K;
11863
11160
  char * qrow = (char *)dst;
@@ -12063,8 +11360,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
12063
11360
  }
12064
11361
 
12065
11362
  #define IQ3S_BLOCK_SIZE 32
12066
- size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12067
- (void)hist;
11363
+ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12068
11364
  GGML_ASSERT(n_per_row%QK_K == 0);
12069
11365
  int nblock = n_per_row/QK_K;
12070
11366
  float scales[QK_K/IQ3S_BLOCK_SIZE];
@@ -12094,7 +11390,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
12094
11390
 
12095
11391
  void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
12096
11392
  assert(k % QK_K == 0);
12097
- quantize_iq3_s(x, y, 1, k, NULL, NULL);
11393
+ quantize_iq3_s(x, y, 1, k, NULL);
12098
11394
  }
12099
11395
 
12100
11396
 
@@ -12160,12 +11456,70 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
12160
11456
  return grid_index;
12161
11457
  }
12162
11458
 
11459
// Weighted squared error between the 8 target values in xval and one grid point.
// Each grid byte g selects the reconstruction level xg[(g - 1)/2], which is then
// multiplied by scale before being compared with xval.
static float iq1_grid_point_error(const int8_t * restrict pg, const float * restrict xval,
        const float * restrict weight, float scale, const float * restrict xg) {
    float d2 = 0;
    for (int i = 0; i < 8; ++i) {
        float q = xg[(pg[i] - 1)/2];
        float w = weight[i];
        float diff = scale*q - xval[i];
        d2 += w*diff*diff;
    }
    return d2;
}

// Select the grid point that minimizes sum_i weight[i]*(scale*q_i - xval[i])^2 for a group of 8 values.
//
//   neighbours - neighbours[0] is the candidate count (must be > 0); neighbours[1..count] are indices into grid
//   grid       - table of grid points; every uint64_t entry is read as 8 int8_t components
//   xval       - the 8 values being approximated
//   weight     - the 8 per-value weights
//   scale      - scale applied to the reconstruction levels
//   xg         - reconstruction levels indexed by (component - 1)/2
//                (the caller in this file passes a 3-entry table; components are presumably 1, 3 or 5 - TODO confirm)
//   L          - output: receives (component - 1)/2 for each of the 8 components of the chosen grid point
//   ngrid      - number of entries in grid, used only by the exhaustive fallback
//
// Returns the index of the chosen grid point. Aborts through GGML_ASSERT when no candidate scores
// below FLT_MAX (which can only happen when the error terms are NaN or overflow).
static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);
    float best_score = FLT_MAX;
    int grid_index = -1;

    // First pass: only the precomputed neighbour candidates.
    for (int j = 1; j <= num_neighbors; ++j) {
        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
        float d2 = iq1_grid_point_error(pg, xval, weight, scale, xg);
        if (d2 < best_score) {
            best_score = d2;
            grid_index = neighbours[j];
        }
    }

    // Fallback: no neighbour was accepted, so scan the whole grid.
    // The error is evaluated per component (0..7); the previous code indexed xval with the
    // grid-point counter, which compared against the wrong value and read past the 8-element slice.
    if (grid_index < 0) {
        for (int i = 0; i < ngrid; ++i) {
            const int8_t * grid_i = (const int8_t *)(grid + i);
            float d2 = iq1_grid_point_error(grid_i, xval, weight, scale, xg);
            if (d2 < best_score) {
                best_score = d2;
                grid_index = i;
            }
        }
    }

    // Still nothing: dump the per-neighbour sums to help diagnose the input before asserting below.
    if (grid_index < 0) {
        printf("Oops, did not find grid point\n");
        printf("Have %d neighbours\n", num_neighbors);
        for (int j = 1; j <= num_neighbors; ++j) {
            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
            float sumqx = 0, sumq2 = 0;
            for (int i = 0; i < 8; ++i) {
                float q = xg[(pg[i] - 1)/2];
                float w = weight[i];
                sumqx += w*q*xval[i];
                sumq2 += w*q*q;
            }
            printf(" neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
        }
    }
    GGML_ASSERT(grid_index >= 0);

    // Report the level index of every component of the winning grid point.
    const int8_t * pg = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) {
        L[i] = (pg[i] - 1)/2;
    }
    return grid_index;
}
11515
+
12163
11516
  static int iq1_sort_helper(const void * left, const void * right) {
12164
11517
  const float * l = left;
12165
11518
  const float * r = right;
12166
11519
  return *l < *r ? -1 : *l > *r ? 1 : 0;
12167
11520
  }
12168
11521
 
11522
+ #define IQ1S_BLOCK_SIZE 32
12169
11523
  static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
12170
11524
 
12171
11525
  const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
@@ -12184,37 +11538,41 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
12184
11538
 
12185
11539
  block_iq1_s * y = vy;
12186
11540
 
12187
- float scales[QK_K/8];
12188
- float weight[8];
12189
- int8_t L[8];
12190
- float sumx[9];
12191
- float sumw[9];
12192
- float pairs[16];
11541
+ const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
11542
+ const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
11543
+
11544
+ float scales[QK_K/IQ1S_BLOCK_SIZE];
11545
+ float weight[IQ1S_BLOCK_SIZE];
11546
+ int8_t L[IQ1S_BLOCK_SIZE];
11547
+ float sumx[IQ1S_BLOCK_SIZE+1];
11548
+ float sumw[IQ1S_BLOCK_SIZE+1];
11549
+ float pairs[2*IQ1S_BLOCK_SIZE];
12193
11550
  int * idx = (int *)(pairs + 1);
12194
- uint8_t hbit[QK_K/8];
11551
+ uint16_t index[IQ1S_BLOCK_SIZE/8];
11552
+ int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
12195
11553
 
12196
11554
  for (int ibl = 0; ibl < nbl; ++ibl) {
12197
11555
 
12198
11556
  y[ibl].d = GGML_FP32_TO_FP16(0.f);
12199
11557
  memset(y[ibl].qs, 0, QK_K/8);
12200
- memset(y[ibl].scales, 0, QK_K/16);
11558
+ memset(y[ibl].qh, 0, QK_K/16);
12201
11559
 
12202
11560
  float max_scale = 0;
12203
11561
 
12204
11562
  const float * xbl = x + QK_K*ibl;
12205
11563
  float sumx2 = 0;
12206
11564
  for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
12207
- float sigma2 = sumx2/QK_K;
11565
+ float sigma2 = 2*sumx2/QK_K;
12208
11566
 
12209
- for (int ib = 0; ib < QK_K/8; ++ib) {
12210
- const float * xb = xbl + 8*ib;
12211
- const float * qw = quant_weights + QK_K*ibl + 8*ib;
12212
- for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
11567
+ for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
11568
+ const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
11569
+ const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
11570
+ for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
12213
11571
  float max = fabsf(xb[0]);
12214
- for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i]));
11572
+ for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
12215
11573
  if (!max) {
12216
11574
  scales[ib] = 0;
12217
- memset(L, 1, 8);
11575
+ memset(L, 1, IQ1S_BLOCK_SIZE);
12218
11576
  continue;
12219
11577
  }
12220
11578
  // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
@@ -12223,52 +11581,81 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
12223
11581
  // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
12224
11582
  // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
12225
11583
  // and score for each possible split.
12226
- for (int j = 0; j < 8; ++j) {
11584
+ for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
12227
11585
  pairs[2*j] = xb[j];
12228
11586
  idx[2*j] = j;
12229
11587
  }
12230
- qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper);
11588
+ qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
12231
11589
  {
12232
11590
  sumx[0] = sumw[0] = 0;
12233
- for (int j = 0; j < 8; ++j) {
11591
+ for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
12234
11592
  int i = idx[2*j];
12235
11593
  sumx[j+1] = sumx[j] + weight[i]*xb[i];
12236
11594
  sumw[j+1] = sumw[j] + weight[i];
12237
11595
  }
12238
11596
  }
12239
11597
  float best_score = 0, scale = max;
12240
- int besti1 = 0, besti2 = 0;
12241
- for (int i1 = 0; i1 <= 8; ++i1) {
12242
- for (int i2 = i1; i2 <= 8; ++i2) {
12243
- float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]);
12244
- float sumq2 = (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]);
11598
+ int besti1 = -1, besti2 = -1, best_shift = 0;
11599
+ for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
11600
+ for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
11601
+ float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
11602
+ float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
12245
11603
  if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
12246
11604
  scale = sumqx/sumq2; best_score = scale*sumqx;
12247
- besti1 = i1; besti2 = i2;
11605
+ besti1 = i1; besti2 = i2; best_shift = 1;
11606
+ }
11607
+ sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
11608
+ sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
11609
+ if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
11610
+ scale = sumqx/sumq2; best_score = scale*sumqx;
11611
+ besti1 = i1; besti2 = i2; best_shift = -1;
12248
11612
  }
12249
11613
  }
12250
11614
  }
11615
+ GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
12251
11616
  for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
12252
11617
  for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
12253
- for (int j = besti2; j < 8; ++j) L[idx[2*j]] = 2;
11618
+ for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
12254
11619
  if (scale < 0) {
12255
- for (int j = 0; j < 8; ++j) L[j] = 2 - L[j];
12256
- scale = -scale;
11620
+ for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
11621
+ scale = -scale; best_shift = -best_shift;
11622
+ }
11623
+ bool all_on_grid = true;
11624
+ const float * xx = best_shift == 1 ? x_p : x_m;
11625
+ for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
11626
+ uint16_t u = 0;
11627
+ for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
11628
+ int grid_index = kmap_q2xs[u];
11629
+ if (grid_index < 0) {
11630
+ all_on_grid = false;
11631
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
11632
+ grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
11633
+ GGML_ASSERT(grid_index >= 0);
11634
+ }
11635
+ index[k] = grid_index;
11636
+ }
11637
+ if (!all_on_grid) {
11638
+ float sumqx = 0, sumq2 = 0;
11639
+ for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
11640
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
11641
+ for (int j = 0; j < 8; ++j) {
11642
+ float w = weight[8*k + j];
11643
+ float q = xx[(pg[j] - 1)/2];
11644
+ sumqx += w*q*xb[8*k+j];
11645
+ sumq2 += w*q*q;
11646
+ }
11647
+ }
11648
+ if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
11649
+ }
11650
+ uint16_t h = 0;
11651
+ for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
11652
+ y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255;
11653
+ h |= (index[k] >> 8) << 3*k;
12257
11654
  }
12258
- // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring
12259
- // grid point that minimizes SSD.
12260
- uint16_t u = 0;
12261
- for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
12262
- int grid_index = kmap_q2xs[u];
12263
- if (grid_index < 0) {
12264
- const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
12265
- grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
12266
- GGML_ASSERT(grid_index >= 0);
12267
- }
12268
- y[ibl].qs[ib] = grid_index & 255;
12269
- hbit[ib] = grid_index >> 8;
11655
+ y[ibl].qh[ib] = h;
12270
11656
  GGML_ASSERT(scale >= 0);
12271
11657
  scales[ib] = scale;
11658
+ shifts[ib] = best_shift;
12272
11659
  max_scale = MAX(max_scale, scale);
12273
11660
  }
12274
11661
 
@@ -12278,19 +11665,18 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
12278
11665
  }
12279
11666
 
12280
11667
  float d = max_scale/15;
12281
- y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
11668
+ y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
12282
11669
  float id = 1/d;
12283
- for (int ib = 0; ib < QK_K/8; ++ib) {
11670
+ for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
12284
11671
  int l = nearest_int(0.5f*(id*scales[ib]-1));
12285
11672
  l = MAX(0, MIN(7, l));
12286
- if (hbit[ib]) l |= 8;
12287
- y[ibl].scales[ib/2] |= (l << 4*(ib%2));
11673
+ if (shifts[ib] == -1) l |= 8;
11674
+ y[ibl].qh[ib] |= (l << 12);
12288
11675
  }
12289
11676
  }
12290
11677
  }
12291
11678
 
12292
- size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12293
- (void)hist;
11679
+ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12294
11680
  GGML_ASSERT(n_per_row%QK_K == 0);
12295
11681
  int nblock = n_per_row/QK_K;
12296
11682
  char * qrow = (char *)dst;
@@ -12315,7 +11701,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
12315
11701
  return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
12316
11702
  }
12317
11703
 
12318
- static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
11704
+ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
12319
11705
  ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
12320
11706
  float * scales, float * weight, uint8_t * L,
12321
11707
  const int8_t * values,
@@ -12423,8 +11809,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
12423
11809
  }
12424
11810
  }
12425
11811
 
12426
- size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12427
- (void)hist;
11812
+ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12428
11813
  GGML_ASSERT(n_per_row%QK4_NL == 0);
12429
11814
  int nblock = n_per_row/QK4_NL;
12430
11815
  char * qrow = (char *)dst;
@@ -12454,14 +11839,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
12454
11839
 
12455
11840
  void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
12456
11841
  assert(k % QK4_NL == 0);
12457
- quantize_iq4_nl(x, y, 1, k, NULL, NULL);
11842
+ quantize_iq4_nl(x, y, 1, k, NULL);
12458
11843
  }
12459
11844
 
12460
- size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11845
+ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12461
11846
  #if QK_K == 64
12462
- return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
11847
+ return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
12463
11848
  #else
12464
- (void)hist;
12465
11849
  GGML_ASSERT(n_per_row%QK_K == 0);
12466
11850
  int nblock = n_per_row/QK_K;
12467
11851
  char * qrow = (char *)dst;
@@ -12490,7 +11874,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
12490
11874
 
12491
11875
  void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
12492
11876
  assert(k % QK_K == 0);
12493
- quantize_iq4_xs(x, y, 1, k, NULL, NULL);
11877
+ quantize_iq4_xs(x, y, 1, k, NULL);
12494
11878
  }
12495
11879
 
12496
11880
  // =============================== 2.5625 bpw
@@ -12663,8 +12047,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
12663
12047
  }
12664
12048
  }
12665
12049
 
12666
- size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
12667
- (void)hist;
12050
+ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
12668
12051
  GGML_ASSERT(n_per_row%QK_K == 0);
12669
12052
  int nblock = n_per_row/QK_K;
12670
12053
  char * qrow = (char *)dst;
@@ -12678,7 +12061,7 @@ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, in
12678
12061
 
12679
12062
  void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
12680
12063
  assert(k % QK_K == 0);
12681
- quantize_iq2_s(x, y, 1, k, NULL, NULL);
12064
+ quantize_iq2_s(x, y, 1, k, NULL);
12682
12065
  }
12683
12066
 
12684
12067
  void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {