llama_cpp 0.12.1 → 0.12.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,6 +5,8 @@
5
5
  #include <string.h>
6
6
  #include <assert.h>
7
7
  #include <float.h>
8
+ #include <stdlib.h> // for qsort
9
+ #include <stdio.h> // for GGML_ASSERT
8
10
 
9
11
  #ifdef __ARM_NEON
10
12
 
@@ -272,10 +274,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
272
274
 
273
275
  // vaddvq_s16
274
276
  // vpaddq_s16
277
+ // vpaddq_s32
275
278
  // vaddvq_s32
276
279
  // vaddvq_f32
277
280
  // vmaxvq_f32
278
281
  // vcvtnq_s32_f32
282
+ // vzip1_u8
283
+ // vzip2_u8
279
284
 
280
285
  inline static int32_t vaddvq_s16(int16x8_t v) {
281
286
  return
@@ -291,6 +296,12 @@ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
291
296
  return vcombine_s16(a0, b0);
292
297
  }
293
298
 
299
+ inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
300
+ int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
301
+ int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
302
+ return vcombine_s32(a0, b0);
303
+ }
304
+
294
305
  inline static int32_t vaddvq_s32(int32x4_t v) {
295
306
  return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
296
307
  }
@@ -316,6 +327,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
316
327
  return res;
317
328
  }
318
329
 
330
+ inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
331
+ uint8x8_t res;
332
+
333
+ res[0] = a[0]; res[1] = b[0];
334
+ res[2] = a[1]; res[3] = b[1];
335
+ res[4] = a[2]; res[5] = b[2];
336
+ res[6] = a[3]; res[7] = b[3];
337
+
338
+ return res;
339
+ }
340
+
341
+ inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
342
+ uint8x8_t res;
343
+
344
+ res[0] = a[4]; res[1] = b[4];
345
+ res[2] = a[5]; res[3] = b[5];
346
+ res[4] = a[6]; res[5] = b[6];
347
+ res[6] = a[7]; res[7] = b[7];
348
+
349
+ return res;
350
+ }
351
+
319
352
  // vld1q_s16_x2
320
353
  // vld1q_u8_x2
321
354
  // vld1q_u8_x4
@@ -482,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
482
515
  quantize_row_q4_0_reference(x, y, k);
483
516
  }
484
517
 
518
+
485
519
  void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
486
520
  const int qk = QK4_1;
487
521
 
@@ -1211,7 +1245,8 @@ static inline int nearest_int(float fval) {
1211
1245
  return (i & 0x007fffff) - 0x00400000;
1212
1246
  }
1213
1247
 
1214
- static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) {
1248
+ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
1249
+ const float * restrict qw) {
1215
1250
  float max = 0;
1216
1251
  float amax = 0;
1217
1252
  for (int i = 0; i < n; ++i) {
@@ -1237,14 +1272,13 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
1237
1272
  rmse_type = -rmse_type;
1238
1273
  return_early = true;
1239
1274
  }
1240
- int weight_type = rmse_type%2;
1241
1275
  float sumlx = 0;
1242
1276
  float suml2 = 0;
1243
1277
  for (int i = 0; i < n; ++i) {
1244
1278
  int l = nearest_int(iscale * x[i]);
1245
1279
  l = MAX(-nmax, MIN(nmax-1, l));
1246
1280
  L[i] = l + nmax;
1247
- float w = weight_type == 1 ? x[i] * x[i] : 1;
1281
+ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
1248
1282
  sumlx += w*x[i]*l;
1249
1283
  suml2 += w*l*l;
1250
1284
  }
@@ -1260,7 +1294,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
1260
1294
  for (int i = 0; i < n; ++i) {
1261
1295
  int l = nearest_int(iscale * x[i]);
1262
1296
  l = MAX(-nmax, MIN(nmax-1, l));
1263
- float w = weight_type == 1 ? x[i] * x[i] : 1;
1297
+ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
1264
1298
  sumlx += w*x[i]*l;
1265
1299
  suml2 += w*l*l;
1266
1300
  }
@@ -1608,6 +1642,241 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n
1608
1642
  return (n/QK_K*sizeof(block_q2_K));
1609
1643
  }
1610
1644
 
1645
// Weight of sample i: the caller-supplied weight when `weights` is non-NULL, otherwise x[i]^2.
static inline float qkx3_weight(const float * weights, const float * x, int i) {
    return weights ? weights[i] : x[i]*x[i];
}

// Quantizes n values so that x[i] ~ scale * L[i] - (*the_min), with L[i] in [0, nmax].
//
//   x, weights   : input values and optional per-value weights (NULL -> x[i]^2 is used)
//   L            : output quantized levels (n entries)
//   the_min      : output, receives the negated offset
//   Laux         : scratch buffer of n entries used while trying candidate scales
//   rmin, rdelta : candidate inverse scales are (rmin + rdelta*is + nmax)/(max - min)
//   nstep        : number of candidate steps (is = 0..nstep); nstep < 1 skips the search
//   use_mad      : true -> weighted absolute error, false -> weighted squared error
//
// Returns the chosen scale (0 when all values collapse onto the offset).
// Reads x[0] unconditionally, so n must be at least 1.
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
        float rmin, float rdelta, int nstep, bool use_mad) {
    // Range of the data plus the weighted sums needed by the least-squares fit below.
    float lo    = x[0];
    float hi    = x[0];
    float sum_w = qkx3_weight(weights, x, 0);
    float sum_x = sum_w * x[0];
    for (int i = 1; i < n; ++i) {
        if (x[i] < lo) lo = x[i];
        if (x[i] > hi) hi = x[i];
        const float w = qkx3_weight(weights, x, i);
        sum_w += w;
        sum_x += w * x[i];
    }
    // The offset is never allowed to be positive.
    if (lo > 0) {
        lo = 0;
    }
    // Degenerate range: everything maps to level 0.
    if (hi <= lo) {
        for (int i = 0; i < n; ++i) L[i] = 0;
        *the_min = -lo;
        return 0.f;
    }

    // Baseline: plain min/max quantization and its weighted error.
    float inv_scale = nmax/(hi - lo);
    float scale     = 1/inv_scale;
    float best_err  = 0;
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(inv_scale*(x[i] - lo));
        L[i] = MAX(0, MIN(nmax, l));
        float diff = scale * L[i] + lo - x[i];
        if (use_mad) {
            diff = fabsf(diff);
        } else {
            diff = diff*diff;
        }
        const float w = qkx3_weight(weights, x, i);
        best_err += w * diff;
    }
    if (nstep < 1) {
        *the_min = -lo;
        return scale;
    }

    // Try nstep+1 candidate inverse scales; for each, solve the weighted least-squares
    // problem for (scale, offset) given the resulting levels and keep the best fit.
    // Note: `lo` is updated whenever a better fit is found, so later candidates are
    // generated from the refined offset.
    for (int is = 0; is <= nstep; ++is) {
        inv_scale = (rmin + rdelta*is + nmax)/(hi - lo);
        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(inv_scale*(x[i] - lo));
            l = MAX(0, MIN(nmax, l));
            Laux[i] = l;
            const float w = qkx3_weight(weights, x, i);
            sum_l  += w*l;
            sum_l2 += w*l*l;
            sum_xl += w*l*x[i];
        }
        const float D = sum_w * sum_l2 - sum_l * sum_l;
        if (!(D > 0)) {
            continue; // singular system: no usable fit for this candidate
        }
        float cand_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
        float cand_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
        if (cand_min > 0) {
            // Positive offsets are not representable: refit with the offset pinned to 0.
            cand_min   = 0;
            cand_scale = sum_xl / sum_l2;
        }
        float err = 0;
        for (int i = 0; i < n; ++i) {
            float diff = cand_scale * Laux[i] + cand_min - x[i];
            if (use_mad) {
                diff = fabsf(diff);
            } else {
                diff = diff*diff;
            }
            const float w = qkx3_weight(weights, x, i);
            err += w * diff;
        }
        if (err < best_err) {
            for (int i = 0; i < n; ++i) {
                L[i] = Laux[i];
            }
            best_err = err;
            scale    = cand_scale;
            lo       = cand_min;
        }
    }
    *the_min = -lo;
    return scale;
}
1722
+
1723
// Quantizes n values so that x[i] ~ scale * L[i], with L[i] clamped above by nmax.
//
//   x             : input values (only an upper clamp is applied, so this presumably
//                   expects non-negative inputs -- confirm against callers)
//   L             : output quantized levels (n entries)
//   quant_weights : per-value weights for the squared-error criterion (must be non-NULL)
//
// Returns the weighted least-squares scale for the final levels, or 0 when the
// inputs are all zero or the weighted sum of squared levels is not positive.
static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
    float max = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    if (!max) { // all zero
        for (int i = 0; i < n; ++i) { L[i] = 0; }
        return 0.f;
    }

    // Baseline: map the maximum onto nmax and measure the weighted squared error.
    float iscale = nmax / max;
    for (int i = 0; i < n; ++i) {
        L[i] = nearest_int(iscale * x[i]);
    }
    float scale = 1/iscale;
    float best_mse = 0;
    for (int i = 0; i < n; ++i) {
        float diff = x[i] - scale*L[i];
        float w = quant_weights[i];
        best_mse += w*diff*diff;
    }

    // Probe inverse scales slightly below/above the baseline and keep the best one.
    for (int is = -4; is <= 4; ++is) {
        if (is == 0) continue; // is == 0 is the baseline already evaluated above
        float iscale_is = (0.1f*is + nmax)/max;
        float scale_is = 1/iscale_is;
        float mse = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(iscale_is*x[i]);
            l = MIN(nmax, l);
            float diff = x[i] - scale_is*l;
            float w = quant_weights[i];
            mse += w*diff*diff;
        }
        if (mse < best_mse) {
            best_mse = mse;
            iscale = iscale_is;
        }
    }

    // Final levels for the chosen inverse scale, plus the sums for the optimal scale.
    float sumlx = 0;
    float suml2 = 0;
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(iscale * x[i]);
        l = MIN(nmax, l);
        L[i] = l;
        float w = quant_weights[i];
        sumlx += w*x[i]*l;
        suml2 += w*l*l;
    }

    // Coordinate descent: re-pick each level given all the others, accepting a change
    // only if it increases sumlx^2/suml2 (compared cross-multiplied to avoid dividing).
    for (int itry = 0; itry < 5; ++itry) {
        int n_changed = 0;
        for (int i = 0; i < n; ++i) {
            float w = quant_weights[i];
            float slx = sumlx - w*x[i]*L[i];
            float sl2 = suml2 - w*L[i]*L[i];
            if (slx > 0 && sl2 > 0) {
                int new_l = nearest_int(x[i] * sl2 / slx);
                new_l = MIN(nmax, new_l);
                if (new_l != L[i]) {
                    slx += w*x[i]*new_l;
                    sl2 += w*new_l*new_l;
                    if (slx*slx*suml2 > sumlx*sumlx*sl2) {
                        L[i] = new_l; sumlx = slx; suml2 = sl2;
                        ++n_changed;
                    }
                }
            }
        }
        if (!n_changed) {
            break;
        }
    }

    // With all-zero weights (or all-zero levels) suml2 is 0 and the division would
    // produce NaN, which would then be stored as the block scale by the caller.
    return suml2 > 0 ? sumlx / suml2 : 0.f;
}
1795
+
1796
+ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
1797
+ GGML_ASSERT(quant_weights);
1798
+ assert(k % QK_K == 0);
1799
+ const int nb = k / QK_K;
1800
+ const bool requantize = true;
1801
+
1802
+ uint8_t L[QK_K];
1803
+ uint8_t Laux[16];
1804
+ float mins[QK_K/16];
1805
+ float scales[QK_K/16];
1806
+ float sw[QK_K/16];
1807
+ float weight[QK_K/16];
1808
+ uint8_t Ls[QK_K/16], Lm[QK_K/16];
1809
+
1810
+ for (int i = 0; i < nb; i++) {
1811
+ memset(sw, 0, QK_K/16*sizeof(float));
1812
+ float sumx2 = 0;
1813
+ for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
1814
+ float sigma2 = sumx2/QK_K;
1815
+ for (int j = 0; j < QK_K/16; ++j) {
1816
+ const float * restrict qw = quant_weights + QK_K * i + 16*j;
1817
+ for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
1818
+ for (int l = 0; l < 16; ++l) sw[j] += weight[l];
1819
+ scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
1820
+ }
1821
+
1822
+ float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
1823
+ float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
1824
+ y[i].d = GGML_FP32_TO_FP16(dm);
1825
+ y[i].dmin = GGML_FP32_TO_FP16(mm);
1826
+ dm = GGML_FP16_TO_FP32(y[i].d);
1827
+ mm = GGML_FP16_TO_FP32(y[i].dmin);
1828
+
1829
+ for (int j = 0; j < QK_K/16; ++j) {
1830
+ y[i].scales[j] = Ls[j] | (Lm[j] << 4);
1831
+ }
1832
+
1833
+ if (requantize) {
1834
+ for (int j = 0; j < QK_K/16; ++j) {
1835
+ const float d = dm * (y[i].scales[j] & 0xF);
1836
+ if (!d) continue;
1837
+ const float m = mm * (y[i].scales[j] >> 4);
1838
+ for (int ii = 0; ii < 16; ++ii) {
1839
+ int l = nearest_int((x[16*j + ii] + m)/d);
1840
+ l = MAX(0, MIN(3, l));
1841
+ L[16*j + ii] = l;
1842
+ }
1843
+ }
1844
+ }
1845
+
1846
+ #if QK_K == 256
1847
+ for (int j = 0; j < QK_K; j += 128) {
1848
+ for (int l = 0; l < 32; ++l) {
1849
+ y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
1850
+ }
1851
+ }
1852
+ #else
1853
+ for (int l = 0; l < 16; ++l) {
1854
+ y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
1855
+ }
1856
+ #endif
1857
+
1858
+ x += QK_K;
1859
+
1860
+ }
1861
+ }
1862
+
1863
+ size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
1864
+ (void)hist;
1865
+ int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1866
+ if (!quant_weights) {
1867
+ quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
1868
+ }
1869
+ else {
1870
+ char * qrow = (char *)dst;
1871
+ for (int row = 0; row < nrow; ++row) {
1872
+ quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
1873
+ src += n_per_row;
1874
+ qrow += row_size;
1875
+ }
1876
+ }
1877
+ return nrow * row_size;
1878
+ }
1879
+
1611
1880
  //========================= 3-bit (de)-quantization
1612
1881
 
1613
1882
  void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
@@ -1821,6 +2090,112 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n
1821
2090
  return (n/QK_K*sizeof(block_q3_K));
1822
2091
  }
1823
2092
 
2093
+ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2094
+ #if QK_K != 256
2095
+ (void)quant_weights;
2096
+ quantize_row_q3_K_reference(x, y, n_per_row);
2097
+ #else
2098
+ assert(n_per_row % QK_K == 0);
2099
+ const int nb = n_per_row / QK_K;
2100
+
2101
+ int8_t L[QK_K];
2102
+ float scales[QK_K / 16];
2103
+ float weight[16];
2104
+ float sw[QK_K / 16];
2105
+ int8_t Ls[QK_K / 16];
2106
+
2107
+ for (int i = 0; i < nb; i++) {
2108
+
2109
+ float sumx2 = 0;
2110
+ for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
2111
+ float sigma2 = 2*sumx2/QK_K;
2112
+
2113
+ for (int j = 0; j < QK_K/16; ++j) {
2114
+ if (quant_weights) {
2115
+ const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
2116
+ for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
2117
+ } else {
2118
+ for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
2119
+ }
2120
+ float sumw = 0;
2121
+ for (int l = 0; l < 16; ++l) sumw += weight[l];
2122
+ sw[j] = sumw;
2123
+
2124
+ scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
2125
+
2126
+ }
2127
+
2128
+ memset(y[i].scales, 0, 12);
2129
+
2130
+ float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
2131
+ for (int j = 0; j < QK_K/16; ++j) {
2132
+ int l = Ls[j];
2133
+ if (j < 8) {
2134
+ y[i].scales[j] = l & 0xF;
2135
+ } else {
2136
+ y[i].scales[j-8] |= ((l & 0xF) << 4);
2137
+ }
2138
+ l >>= 4;
2139
+ y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
2140
+ }
2141
+ y[i].d = GGML_FP32_TO_FP16(d_block);
2142
+
2143
+ int8_t sc;
2144
+ for (int j = 0; j < QK_K/16; ++j) {
2145
+ sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
2146
+ sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
2147
+ float d = GGML_FP16_TO_FP32(y[i].d) * sc;
2148
+ if (!d) {
2149
+ continue;
2150
+ }
2151
+ for (int ii = 0; ii < 16; ++ii) {
2152
+ int l = nearest_int(x[16*j + ii]/d);
2153
+ l = MAX(-4, MIN(3, l));
2154
+ L[16*j + ii] = l + 4;
2155
+ }
2156
+ }
2157
+
2158
+ memset(y[i].hmask, 0, QK_K/8);
2159
+ // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
2160
+ int m = 0;
2161
+ uint8_t hm = 1;
2162
+ for (int j = 0; j < QK_K; ++j) {
2163
+ if (L[j] > 3) {
2164
+ y[i].hmask[m] |= hm;
2165
+ L[j] -= 4;
2166
+ }
2167
+ if (++m == QK_K/8) {
2168
+ m = 0; hm <<= 1;
2169
+ }
2170
+ }
2171
+ for (int j = 0; j < QK_K; j += 128) {
2172
+ for (int l = 0; l < 32; ++l) {
2173
+ y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
2174
+ }
2175
+ }
2176
+
2177
+ x += QK_K;
2178
+ }
2179
+ #endif
2180
+ }
2181
+
2182
+ size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2183
+ (void)hist;
2184
+ int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2185
+ if (!quant_weights) {
2186
+ quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
2187
+ }
2188
+ else {
2189
+ char * qrow = (char *)dst;
2190
+ for (int row = 0; row < nrow; ++row) {
2191
+ quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
2192
+ src += n_per_row;
2193
+ qrow += row_size;
2194
+ }
2195
+ }
2196
+ return nrow * row_size;
2197
+ }
2198
+
1824
2199
  // ====================== 4-bit (de)-quantization
1825
2200
 
1826
2201
  void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
@@ -1986,36 +2361,38 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n
1986
2361
  return (n/QK_K*sizeof(block_q4_K));
1987
2362
  }
1988
2363
 
1989
- // ====================== 5-bit (de)-quantization
1990
-
1991
- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
1992
- assert(k % QK_K == 0);
1993
- const int nb = k / QK_K;
2364
+ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2365
+ #if QK_K != 256
2366
+ (void)quant_weights;
2367
+ quantize_row_q4_K_reference(x, y, n_per_row);
2368
+ #else
2369
+ assert(n_per_row % QK_K == 0);
2370
+ const int nb = n_per_row / QK_K;
1994
2371
 
1995
- #if QK_K == 256
1996
2372
  uint8_t L[QK_K];
2373
+ uint8_t Laux[32];
2374
+ float weights[32];
1997
2375
  float mins[QK_K/32];
1998
2376
  float scales[QK_K/32];
1999
- float weights[32];
2000
- uint8_t Laux[32];
2001
- #else
2002
- int8_t L[QK_K];
2003
- float scales[QK_K/16];
2004
- #endif
2005
2377
 
2006
2378
  for (int i = 0; i < nb; i++) {
2007
2379
 
2008
- #if QK_K == 256
2380
+ float sum_x2 = 0;
2381
+ for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2382
+ float sigma2 = sum_x2/QK_K;
2383
+ float av_x = sqrtf(sigma2);
2009
2384
 
2010
2385
  float max_scale = 0; // as we are deducting the min, scales are always positive
2011
2386
  float max_min = 0;
2012
2387
  for (int j = 0; j < QK_K/32; ++j) {
2013
- //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
2014
- float sum_x2 = 0;
2015
- for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
2016
- float av_x = sqrtf(sum_x2/32);
2017
- for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2018
- scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
2388
+ if (quant_weights) {
2389
+ const float * qw = quant_weights + QK_K*i + 32*j;
2390
+ for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
2391
+ } else {
2392
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2393
+ }
2394
+ scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2395
+ //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
2019
2396
  float scale = scales[j];
2020
2397
  if (scale > max_scale) {
2021
2398
  max_scale = scale;
@@ -2053,18 +2430,118 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
2053
2430
  const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
2054
2431
  for (int ii = 0; ii < 32; ++ii) {
2055
2432
  int l = nearest_int((x[32*j + ii] + dm)/d);
2056
- l = MAX(0, MIN(31, l));
2433
+ l = MAX(0, MIN(15, l));
2057
2434
  L[32*j + ii] = l;
2058
2435
  }
2059
2436
  }
2437
+ uint8_t * q = y[i].qs;
2438
+ for (int j = 0; j < QK_K; j += 64) {
2439
+ for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
2440
+ q += 32;
2441
+ }
2060
2442
 
2061
- uint8_t * restrict qh = y[i].qh;
2062
- uint8_t * restrict ql = y[i].qs;
2063
- memset(qh, 0, QK_K/8);
2443
+ x += QK_K;
2064
2444
 
2065
- uint8_t m1 = 1, m2 = 2;
2066
- for (int n = 0; n < QK_K; n += 64) {
2067
- for (int j = 0; j < 32; ++j) {
2445
+ }
2446
+ #endif
2447
+ }
2448
+
2449
+ size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2450
+ (void)hist;
2451
+ int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2452
+ if (!quant_weights) {
2453
+ quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
2454
+ }
2455
+ else {
2456
+ char * qrow = (char *)dst;
2457
+ for (int row = 0; row < nrow; ++row) {
2458
+ quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
2459
+ src += n_per_row;
2460
+ qrow += row_size;
2461
+ }
2462
+ }
2463
+ return nrow * row_size;
2464
+ }
2465
+
2466
+ // ====================== 5-bit (de)-quantization
2467
+
2468
+ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
2469
+ assert(k % QK_K == 0);
2470
+ const int nb = k / QK_K;
2471
+
2472
+ #if QK_K == 256
2473
+ uint8_t L[QK_K];
2474
+ float mins[QK_K/32];
2475
+ float scales[QK_K/32];
2476
+ float weights[32];
2477
+ uint8_t Laux[32];
2478
+ #else
2479
+ int8_t L[QK_K];
2480
+ float scales[QK_K/16];
2481
+ #endif
2482
+
2483
+ for (int i = 0; i < nb; i++) {
2484
+
2485
+ #if QK_K == 256
2486
+
2487
+ float max_scale = 0; // as we are deducting the min, scales are always positive
2488
+ float max_min = 0;
2489
+ for (int j = 0; j < QK_K/32; ++j) {
2490
+ //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
2491
+ float sum_x2 = 0;
2492
+ for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
2493
+ float av_x = sqrtf(sum_x2/32);
2494
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2495
+ scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
2496
+ float scale = scales[j];
2497
+ if (scale > max_scale) {
2498
+ max_scale = scale;
2499
+ }
2500
+ float min = mins[j];
2501
+ if (min > max_min) {
2502
+ max_min = min;
2503
+ }
2504
+ }
2505
+
2506
+ float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2507
+ float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2508
+ for (int j = 0; j < QK_K/32; ++j) {
2509
+ uint8_t ls = nearest_int(inv_scale*scales[j]);
2510
+ uint8_t lm = nearest_int(inv_min*mins[j]);
2511
+ ls = MIN(63, ls);
2512
+ lm = MIN(63, lm);
2513
+ if (j < 4) {
2514
+ y[i].scales[j] = ls;
2515
+ y[i].scales[j+4] = lm;
2516
+ } else {
2517
+ y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
2518
+ y[i].scales[j-4] |= ((ls >> 4) << 6);
2519
+ y[i].scales[j-0] |= ((lm >> 4) << 6);
2520
+ }
2521
+ }
2522
+ y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2523
+ y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2524
+
2525
+ uint8_t sc, m;
2526
+ for (int j = 0; j < QK_K/32; ++j) {
2527
+ get_scale_min_k4(j, y[i].scales, &sc, &m);
2528
+ const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
2529
+ if (!d) continue;
2530
+ const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
2531
+ for (int ii = 0; ii < 32; ++ii) {
2532
+ int l = nearest_int((x[32*j + ii] + dm)/d);
2533
+ l = MAX(0, MIN(31, l));
2534
+ L[32*j + ii] = l;
2535
+ }
2536
+ }
2537
+
2538
+ uint8_t * restrict qh = y[i].qh;
2539
+ uint8_t * restrict ql = y[i].qs;
2540
+ memset(qh, 0, QK_K/8);
2541
+
2542
+ uint8_t m1 = 1, m2 = 2;
2543
+ for (int n = 0; n < QK_K; n += 64) {
2544
+ for (int j = 0; j < 32; ++j) {
2068
2545
  int l1 = L[n + j];
2069
2546
  if (l1 > 15) {
2070
2547
  l1 -= 16; qh[j] |= m1;
@@ -2081,7 +2558,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
2081
2558
  #else
2082
2559
  float max_scale = 0, amax = 0;
2083
2560
  for (int j = 0; j < QK_K/16; ++j) {
2084
- scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1);
2561
+ scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL);
2085
2562
  float abs_scale = fabsf(scales[j]);
2086
2563
  if (abs_scale > amax) {
2087
2564
  amax = abs_scale;
@@ -2192,6 +2669,123 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n
2192
2669
  return (n/QK_K*sizeof(block_q5_K));
2193
2670
  }
2194
2671
 
2672
+ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2673
+ #if QK_K != 256
2674
+ (void)quant_weights;
2675
+ quantize_row_q5_K_reference(x, y, n_per_row);
2676
+ #else
2677
+ assert(n_per_row % QK_K == 0);
2678
+ const int nb = n_per_row / QK_K;
2679
+
2680
+ uint8_t L[QK_K];
2681
+ float mins[QK_K/32];
2682
+ float scales[QK_K/32];
2683
+ float weights[32];
2684
+ uint8_t Laux[32];
2685
+
2686
+ for (int i = 0; i < nb; i++) {
2687
+
2688
+ float sum_x2 = 0;
2689
+ for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2690
+ float sigma2 = sum_x2/QK_K;
2691
+ float av_x = sqrtf(sigma2);
2692
+
2693
+ float max_scale = 0; // as we are deducting the min, scales are always positive
2694
+ float max_min = 0;
2695
+ for (int j = 0; j < QK_K/32; ++j) {
2696
+ if (quant_weights) {
2697
+ const float * qw = quant_weights + QK_K*i + 32*j;
2698
+ for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
2699
+ } else {
2700
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2701
+ }
2702
+ scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2703
+ float scale = scales[j];
2704
+ if (scale > max_scale) {
2705
+ max_scale = scale;
2706
+ }
2707
+ float min = mins[j];
2708
+ if (min > max_min) {
2709
+ max_min = min;
2710
+ }
2711
+ }
2712
+
2713
+ float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2714
+ float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2715
+ for (int j = 0; j < QK_K/32; ++j) {
2716
+ uint8_t ls = nearest_int(inv_scale*scales[j]);
2717
+ uint8_t lm = nearest_int(inv_min*mins[j]);
2718
+ ls = MIN(63, ls);
2719
+ lm = MIN(63, lm);
2720
+ if (j < 4) {
2721
+ y[i].scales[j] = ls;
2722
+ y[i].scales[j+4] = lm;
2723
+ } else {
2724
+ y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
2725
+ y[i].scales[j-4] |= ((ls >> 4) << 6);
2726
+ y[i].scales[j-0] |= ((lm >> 4) << 6);
2727
+ }
2728
+ }
2729
+ y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2730
+ y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2731
+
2732
+ uint8_t sc, m;
2733
+ for (int j = 0; j < QK_K/32; ++j) {
2734
+ get_scale_min_k4(j, y[i].scales, &sc, &m);
2735
+ const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
2736
+ if (!d) continue;
2737
+ const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
2738
+ for (int ii = 0; ii < 32; ++ii) {
2739
+ int l = nearest_int((x[32*j + ii] + dm)/d);
2740
+ l = MAX(0, MIN(31, l));
2741
+ L[32*j + ii] = l;
2742
+ }
2743
+ }
2744
+
2745
+ uint8_t * restrict qh = y[i].qh;
2746
+ uint8_t * restrict ql = y[i].qs;
2747
+ memset(qh, 0, QK_K/8);
2748
+
2749
+ uint8_t m1 = 1, m2 = 2;
2750
+ for (int n = 0; n < QK_K; n += 64) {
2751
+ for (int j = 0; j < 32; ++j) {
2752
+ int l1 = L[n + j];
2753
+ if (l1 > 15) {
2754
+ l1 -= 16; qh[j] |= m1;
2755
+ }
2756
+ int l2 = L[n + j + 32];
2757
+ if (l2 > 15) {
2758
+ l2 -= 16; qh[j] |= m2;
2759
+ }
2760
+ ql[j] = l1 | (l2 << 4);
2761
+ }
2762
+ m1 <<= 2; m2 <<= 2;
2763
+ ql += 32;
2764
+ }
2765
+
2766
+ x += QK_K;
2767
+
2768
+ }
2769
+ #endif
2770
+ }
2771
+
2772
+ size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2773
+ (void)hist;
2774
+ int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2775
+ if (!quant_weights) {
2776
+ quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
2777
+ }
2778
+ else {
2779
+ char * qrow = (char *)dst;
2780
+ for (int row = 0; row < nrow; ++row) {
2781
+ quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
2782
+ src += n_per_row;
2783
+ qrow += row_size;
2784
+ }
2785
+ }
2786
+ return nrow * row_size;
2787
+ }
2788
+
2195
2789
  // ====================== 6-bit (de)-quantization
2196
2790
 
2197
2791
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
@@ -2208,7 +2802,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2208
2802
 
2209
2803
  for (int ib = 0; ib < QK_K/16; ++ib) {
2210
2804
 
2211
- const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1);
2805
+ const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
2212
2806
  scales[ib] = scale;
2213
2807
 
2214
2808
  const float abs_scale = fabsf(scale);
@@ -2276,68 +2870,365 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2276
2870
  }
2277
2871
  }
2278
2872
 
2279
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
2280
- assert(k % QK_K == 0);
2281
- const int nb = k / QK_K;
2873
+ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
2874
+ assert(k % QK_K == 0);
2875
+ const int nb = k / QK_K;
2876
+
2877
+ for (int i = 0; i < nb; i++) {
2878
+
2879
+ const float d = GGML_FP16_TO_FP32(x[i].d);
2880
+
2881
+ const uint8_t * restrict ql = x[i].ql;
2882
+ const uint8_t * restrict qh = x[i].qh;
2883
+ const int8_t * restrict sc = x[i].scales;
2884
+
2885
+ #if QK_K == 256
2886
+ for (int n = 0; n < QK_K; n += 128) {
2887
+ for (int l = 0; l < 32; ++l) {
2888
+ int is = l/16;
2889
+ const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2890
+ const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2891
+ const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2892
+ const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2893
+ y[l + 0] = d * sc[is + 0] * q1;
2894
+ y[l + 32] = d * sc[is + 2] * q2;
2895
+ y[l + 64] = d * sc[is + 4] * q3;
2896
+ y[l + 96] = d * sc[is + 6] * q4;
2897
+ }
2898
+ y += 128;
2899
+ ql += 64;
2900
+ qh += 32;
2901
+ sc += 8;
2902
+ }
2903
+ #else
2904
+ for (int l = 0; l < 16; ++l) {
2905
+ const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2906
+ const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2907
+ const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2908
+ const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2909
+ y[l+ 0] = d * sc[0] * q1;
2910
+ y[l+16] = d * sc[1] * q2;
2911
+ y[l+32] = d * sc[2] * q3;
2912
+ y[l+48] = d * sc[3] * q4;
2913
+ }
2914
+ y += 64;
2915
+ #endif
2916
+
2917
+ }
2918
+ }
2919
+
2920
+ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
2921
+ assert(k % QK_K == 0);
2922
+ block_q6_K * restrict y = vy;
2923
+ quantize_row_q6_K_reference(x, y, k);
2924
+ }
2925
+
2926
+ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
2927
+ assert(k % QK_K == 0);
2928
+ (void)hist; // TODO: collect histograms
2929
+
2930
+ for (int j = 0; j < n; j += k) {
2931
+ block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
2932
+ quantize_row_q6_K_reference(src + j, y, k);
2933
+ }
2934
+ return (n/QK_K*sizeof(block_q6_K));
2935
+ }
2936
+
2937
+ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
2938
+ #if QK_K != 256
2939
+ (void)quant_weights;
2940
+ quantize_row_q6_K_reference(x, y, n_per_row);
2941
+ #else
2942
+ assert(n_per_row % QK_K == 0);
2943
+ const int nb = n_per_row / QK_K;
2944
+
2945
+ int8_t L[QK_K];
2946
+ float scales[QK_K/16];
2947
+ //float weights[16];
2948
+
2949
+ for (int i = 0; i < nb; i++) {
2950
+
2951
+ //float sum_x2 = 0;
2952
+ //for (int j = 0; j < QK_K; ++j) sum_x2 += x[j]*x[j];
2953
+ //float sigma2 = sum_x2/QK_K;
2954
+
2955
+ float max_scale = 0;
2956
+ float max_abs_scale = 0;
2957
+
2958
+ for (int ib = 0; ib < QK_K/16; ++ib) {
2959
+
2960
+ float scale;
2961
+ if (quant_weights) {
2962
+ const float * qw = quant_weights + QK_K*i + 16*ib;
2963
+ //for (int j = 0; j < 16; ++j) weights[j] = qw[j] * sqrtf(sigma2 + x[16*ib + j]*x[16*ib + j]);
2964
+ //scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, weights);
2965
+ scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw);
2966
+ } else {
2967
+ scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
2968
+ }
2969
+ scales[ib] = scale;
2970
+
2971
+ const float abs_scale = fabsf(scale);
2972
+ if (abs_scale > max_abs_scale) {
2973
+ max_abs_scale = abs_scale;
2974
+ max_scale = scale;
2975
+ }
2976
+
2977
+ }
2978
+
2979
+ if (!max_abs_scale) {
2980
+ memset(&y[i], 0, sizeof(block_q6_K));
2981
+ y[i].d = GGML_FP32_TO_FP16(0.f);
2982
+ x += QK_K;
2983
+ continue;
2984
+ }
2985
+
2986
+ float iscale = -128.f/max_scale;
2987
+ y[i].d = GGML_FP32_TO_FP16(1/iscale);
2988
+ for (int ib = 0; ib < QK_K/16; ++ib) {
2989
+ y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
2990
+ }
2991
+
2992
+ for (int j = 0; j < QK_K/16; ++j) {
2993
+ float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
2994
+ if (!d) {
2995
+ continue;
2996
+ }
2997
+ for (int ii = 0; ii < 16; ++ii) {
2998
+ int l = nearest_int(x[16*j + ii]/d);
2999
+ l = MAX(-32, MIN(31, l));
3000
+ L[16*j + ii] = l + 32;
3001
+ }
3002
+ }
3003
+
3004
+ uint8_t * restrict ql = y[i].ql;
3005
+ uint8_t * restrict qh = y[i].qh;
3006
+ for (int j = 0; j < QK_K; j += 128) {
3007
+ for (int l = 0; l < 32; ++l) {
3008
+ const uint8_t q1 = L[j + l + 0] & 0xF;
3009
+ const uint8_t q2 = L[j + l + 32] & 0xF;
3010
+ const uint8_t q3 = L[j + l + 64] & 0xF;
3011
+ const uint8_t q4 = L[j + l + 96] & 0xF;
3012
+ ql[l+ 0] = q1 | (q3 << 4);
3013
+ ql[l+32] = q2 | (q4 << 4);
3014
+ qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
3015
+ }
3016
+ ql += 64;
3017
+ qh += 32;
3018
+ }
3019
+
3020
+ x += QK_K;
3021
+
3022
+ }
3023
+ #endif
3024
+ }
3025
+
3026
+ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3027
+ (void)hist;
3028
+ int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3029
+ if (!quant_weights) {
3030
+ quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
3031
+ }
3032
+ else {
3033
+ char * qrow = (char *)dst;
3034
+ for (int row = 0; row < nrow; ++row) {
3035
+ quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
3036
+ src += n_per_row;
3037
+ qrow += row_size;
3038
+ }
3039
+ }
3040
+ return nrow * row_size;
3041
+ }
3042
+
3043
+ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
3044
+ static_assert(QK4_0 == 32, "QK4_0 must be 32");
3045
+
3046
+ if (!quant_weights) {
3047
+ quantize_row_q4_0_reference(x, y, n_per_row);
3048
+ return;
3049
+ }
3050
+
3051
+ float weight[QK4_0];
3052
+ int8_t L[QK4_0];
3053
+
3054
+ float sum_x2 = 0;
3055
+ for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3056
+ float sigma2 = sum_x2/n_per_row;
3057
+
3058
+ const int nb = n_per_row/QK4_0;
3059
+ for (int ib = 0; ib < nb; ++ib) {
3060
+ const float * xb = x + QK4_0 * ib;
3061
+ const float * qw = quant_weights + QK4_0 * ib;
3062
+ for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
3063
+ float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
3064
+ y[ib].d = GGML_FP32_TO_FP16(d);
3065
+ for (int j = 0; j < 16; ++j) {
3066
+ y[ib].qs[j] = L[j] | (L[j+16] << 4);
3067
+ }
3068
+ }
3069
+ }
3070
+
3071
+ size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3072
+ if (!quant_weights) {
3073
+ return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
3074
+ }
3075
+ int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3076
+ char * qrow = (char *)dst;
3077
+ for (int row = 0; row < nrow; ++row) {
3078
+ quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
3079
+ src += n_per_row;
3080
+ qrow += row_size;
3081
+ }
3082
+ return nrow * row_size;
3083
+ }
3084
+
3085
+ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
3086
+ static_assert(QK4_1 == 32, "QK4_1 must be 32");
2282
3087
 
2283
- for (int i = 0; i < nb; i++) {
3088
+ if (!quant_weights) {
3089
+ quantize_row_q4_1_reference(x, y, n_per_row);
3090
+ return;
3091
+ }
2284
3092
 
2285
- const float d = GGML_FP16_TO_FP32(x[i].d);
3093
+ float weight[QK4_1];
3094
+ uint8_t L[QK4_1], Laux[QK4_1];
2286
3095
 
2287
- const uint8_t * restrict ql = x[i].ql;
2288
- const uint8_t * restrict qh = x[i].qh;
2289
- const int8_t * restrict sc = x[i].scales;
3096
+ float sum_x2 = 0;
3097
+ for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3098
+ float sigma2 = sum_x2/n_per_row;
2290
3099
 
2291
- #if QK_K == 256
2292
- for (int n = 0; n < QK_K; n += 128) {
2293
- for (int l = 0; l < 32; ++l) {
2294
- int is = l/16;
2295
- const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2296
- const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2297
- const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2298
- const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2299
- y[l + 0] = d * sc[is + 0] * q1;
2300
- y[l + 32] = d * sc[is + 2] * q2;
2301
- y[l + 64] = d * sc[is + 4] * q3;
2302
- y[l + 96] = d * sc[is + 6] * q4;
2303
- }
2304
- y += 128;
2305
- ql += 64;
2306
- qh += 32;
2307
- sc += 8;
3100
+ const int nb = n_per_row/QK4_1;
3101
+ for (int ib = 0; ib < nb; ++ib) {
3102
+ const float * xb = x + QK4_1 * ib;
3103
+ const float * qw = quant_weights + QK4_1 * ib;
3104
+ for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
3105
+ float min;
3106
+ float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
3107
+ y[ib].d = GGML_FP32_TO_FP16(d);
3108
+ y[ib].m = GGML_FP32_TO_FP16(-min);
3109
+ for (int j = 0; j < 16; ++j) {
3110
+ y[ib].qs[j] = L[j] | (L[j+16] << 4);
2308
3111
  }
2309
- #else
2310
- for (int l = 0; l < 16; ++l) {
2311
- const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2312
- const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2313
- const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2314
- const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2315
- y[l+ 0] = d * sc[0] * q1;
2316
- y[l+16] = d * sc[1] * q2;
2317
- y[l+32] = d * sc[2] * q3;
2318
- y[l+48] = d * sc[3] * q4;
3112
+ }
3113
+ }
3114
+
3115
+ size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3116
+ if (!quant_weights) {
3117
+ return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
3118
+ }
3119
+ int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3120
+ char * qrow = (char *)dst;
3121
+ for (int row = 0; row < nrow; ++row) {
3122
+ quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
3123
+ src += n_per_row;
3124
+ qrow += row_size;
3125
+ }
3126
+ return nrow * row_size;
3127
+ }
3128
+
3129
+ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
3130
+ static_assert(QK5_0 == 32, "QK5_0 must be 32");
3131
+
3132
+ if (!quant_weights) {
3133
+ quantize_row_q5_0_reference(x, y, n_per_row);
3134
+ return;
3135
+ }
3136
+
3137
+ float weight[QK5_0];
3138
+ int8_t L[QK5_0];
3139
+
3140
+ float sum_x2 = 0;
3141
+ for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3142
+ float sigma2 = sum_x2/n_per_row;
3143
+
3144
+ const int nb = n_per_row/QK5_0;
3145
+ for (int ib = 0; ib < nb; ++ib) {
3146
+ const float * xb = x + QK5_0 * ib;
3147
+ const float * qw = quant_weights + QK5_0 * ib;
3148
+ for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
3149
+ float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
3150
+ y[ib].d = GGML_FP32_TO_FP16(d);
3151
+
3152
+ uint32_t qh = 0;
3153
+
3154
+ for (int j = 0; j < 16; ++j) {
3155
+ const uint8_t xi0 = L[j];
3156
+ const uint8_t xi1 = L[j+16];
3157
+ y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
3158
+
3159
+ // get the 5-th bit and store it in qh at the right position
3160
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
3161
+ qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
2319
3162
  }
2320
- y += 64;
2321
- #endif
2322
3163
 
3164
+ memcpy(&y[ib].qh, &qh, sizeof(qh));
2323
3165
  }
2324
3166
  }
2325
3167
 
2326
- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
2327
- assert(k % QK_K == 0);
2328
- block_q6_K * restrict y = vy;
2329
- quantize_row_q6_K_reference(x, y, k);
3168
+ size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3169
+ if (!quant_weights) {
3170
+ return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
3171
+ }
3172
+ int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3173
+ char * qrow = (char *)dst;
3174
+ for (int row = 0; row < nrow; ++row) {
3175
+ quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
3176
+ src += n_per_row;
3177
+ qrow += row_size;
3178
+ }
3179
+ return nrow * row_size;
2330
3180
  }
2331
3181
 
2332
- size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
2333
- assert(k % QK_K == 0);
2334
- (void)hist; // TODO: collect histograms
3182
+ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
3183
+ static_assert(QK5_1 == 32, "QK5_1 must be 32");
2335
3184
 
2336
- for (int j = 0; j < n; j += k) {
2337
- block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
2338
- quantize_row_q6_K_reference(src + j, y, k);
3185
+ if (!quant_weights) {
3186
+ quantize_row_q5_1_reference(x, y, n_per_row);
3187
+ return;
2339
3188
  }
2340
- return (n/QK_K*sizeof(block_q6_K));
3189
+
3190
+ float weight[QK5_1];
3191
+ uint8_t L[QK5_1], Laux[QK5_1];
3192
+
3193
+ float sum_x2 = 0;
3194
+ for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3195
+ float sigma2 = sum_x2/n_per_row;
3196
+
3197
+ const int nb = n_per_row/QK5_1;
3198
+ for (int ib = 0; ib < nb; ++ib) {
3199
+ const float * xb = x + QK5_1 * ib;
3200
+ const float * qw = quant_weights + QK5_1 * ib;
3201
+ for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
3202
+ float min;
3203
+ float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
3204
+ y[ib].d = GGML_FP32_TO_FP16(d);
3205
+ y[ib].m = GGML_FP32_TO_FP16(-min);
3206
+
3207
+ uint32_t qh = 0;
3208
+ for (int j = 0; j < 16; ++j) {
3209
+ const uint8_t xi0 = L[j];
3210
+ const uint8_t xi1 = L[j+16];
3211
+ y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
3212
+ // get the 5-th bit and store it in qh at the right position
3213
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
3214
+ qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
3215
+ }
3216
+ memcpy(&y[ib].qh, &qh, sizeof(qh));
3217
+ }
3218
+ }
3219
+
3220
+ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3221
+ if (!quant_weights) {
3222
+ return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
3223
+ }
3224
+ int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3225
+ char * qrow = (char *)dst;
3226
+ for (int row = 0; row < nrow; ++row) {
3227
+ quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
3228
+ src += n_per_row;
3229
+ qrow += row_size;
3230
+ }
3231
+ return nrow * row_size;
2341
3232
  }
2342
3233
 
2343
3234
  // ====================== "True" 2-bit (de)-quantization
@@ -2553,14 +3444,6 @@ static const uint8_t ksigns_iq2xs[128] = {
2553
3444
 
2554
3445
  static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2555
3446
 
2556
- void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
2557
- (void)x;
2558
- (void)y;
2559
- (void)k;
2560
- assert(k % QK_K == 0);
2561
- //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2562
- }
2563
-
2564
3447
  void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
2565
3448
  assert(k % QK_K == 0);
2566
3449
  const int nb = k / QK_K;
@@ -2587,33 +3470,8 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
2587
3470
  }
2588
3471
  }
2589
3472
 
2590
- void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
2591
- assert(k % QK_K == 0);
2592
- block_iq2_xxs * restrict y = vy;
2593
- quantize_row_iq2_xxs_reference(x, y, k);
2594
- }
2595
-
2596
- size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
2597
- assert(k % QK_K == 0);
2598
- (void)hist; // TODO: collect histograms
2599
-
2600
- for (int j = 0; j < n; j += k) {
2601
- block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
2602
- quantize_row_iq2_xxs_reference(src + j, y, k);
2603
- }
2604
- return (n/QK_K*sizeof(block_iq2_xxs));
2605
- }
2606
-
2607
3473
  // ====================== 2.3125 bpw (de)-quantization
2608
3474
 
2609
- void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
2610
- (void)x;
2611
- (void)y;
2612
- (void)k;
2613
- assert(k % QK_K == 0);
2614
- //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2615
- }
2616
-
2617
3475
  void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
2618
3476
  assert(k % QK_K == 0);
2619
3477
  const int nb = k / QK_K;
@@ -2639,23 +3497,6 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
2639
3497
  }
2640
3498
  }
2641
3499
 
2642
- void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
2643
- assert(k % QK_K == 0);
2644
- block_iq2_xs * restrict y = vy;
2645
- quantize_row_iq2_xs_reference(x, y, k);
2646
- }
2647
-
2648
- size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
2649
- assert(k % QK_K == 0);
2650
- (void)hist; // TODO: collect histograms
2651
-
2652
- for (int j = 0; j < n; j += k) {
2653
- block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
2654
- quantize_row_iq2_xs_reference(src + j, y, k);
2655
- }
2656
- return (n/QK_K*sizeof(block_iq2_xs));
2657
- }
2658
-
2659
3500
  //===================================== Q8_K ==============================================
2660
3501
 
2661
3502
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -7554,9 +8395,9 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
7554
8395
 
7555
8396
  const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
7556
8397
 
7557
- int8x16x4_t q2u;
7558
- int8x16x4_t q2s;
7559
- int8x16x4_t q8b;
8398
+ ggml_int8x16x4_t q2u;
8399
+ ggml_int8x16x4_t q2s;
8400
+ ggml_int8x16x4_t q8b;
7560
8401
 
7561
8402
  int32x4x4_t scales32;
7562
8403
 
@@ -7578,7 +8419,7 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
7578
8419
  scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
7579
8420
  int32x4_t sumi = vdupq_n_s32(0);
7580
8421
  for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
7581
- q8b = vld1q_s8_x4(q8); q8 += 64;
8422
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
7582
8423
  q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
7583
8424
  q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
7584
8425
  q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
@@ -7699,3 +8540,666 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
7699
8540
  *s = 0.125f * sumf;
7700
8541
  #endif
7701
8542
  }
8543
+
8544
+ // ================================ IQ2 quantization =============================================
8545
+
8546
// Lookup tables for one IQ2 grid size; all three are heap-allocated by q2xs_init_impl.
typedef struct {
    uint64_t * grid;        // one 8-byte entry per grid point
    int      * map;         // packed lattice index -> grid index, or a negative neighbour-list reference
    uint16_t * neighbours;  // concatenated neighbour lists, each prefixed by its length
} iq2_entry_t;

// Slot 0 is used for grid size 256, slot 1 for grid size 512 (see iq2_data_index).
static iq2_entry_t iq2_data[2] = {
    { .grid = NULL, .map = NULL, .neighbours = NULL },
    { .grid = NULL, .map = NULL, .neighbours = NULL },
};
8556
+
8557
// Map a grid size to its slot in iq2_data: 256 -> 0, 512 -> 1. Any other size fails the assertion.
static inline int iq2_data_index(int grid_size) {
    GGML_ASSERT(grid_size == 256 || grid_size == 512);
    if (grid_size == 256) {
        return 0;
    }
    return 1;
}
8561
+
8562
// qsort comparator for pairs of ints: orders by the first element, then by the second.
// Returns -1, 0 or 1.
static int iq2_compare_func(const void * left, const void * right) {
    const int * a = (const int *)left;
    const int * b = (const int *)right;
    if (a[0] != b[0]) {
        return a[0] < b[0] ? -1 : 1;
    }
    if (a[1] != b[1]) {
        return a[1] < b[1] ? -1 : 1;
    }
    return 0;
}
8567
+
8568
+ static void q2xs_init_impl(int grid_size) {
8569
+ const int gindex = iq2_data_index(grid_size);
8570
+ if (iq2_data[gindex].grid) {
8571
+ return;
8572
+ }
8573
+ static const uint16_t kgrid_256[256] = {
8574
+ 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
8575
+ 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
8576
+ 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
8577
+ 1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
8578
+ 2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
8579
+ 4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
8580
+ 5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
8581
+ 8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
8582
+ 10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
8583
+ 16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
8584
+ 17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
8585
+ 20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
8586
+ 22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
8587
+ 25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
8588
+ 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
8589
+ 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
8590
+ };
8591
+ static const uint16_t kgrid_512[512] = {
8592
+ 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
8593
+ 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
8594
+ 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
8595
+ 352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
8596
+ 640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
8597
+ 1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
8598
+ 1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
8599
+ 2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
8600
+ 2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
8601
+ 4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
8602
+ 4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
8603
+ 5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
8604
+ 5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
8605
+ 8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
8606
+ 8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
8607
+ 10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
8608
+ 16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
8609
+ 16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
8610
+ 16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
8611
+ 17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
8612
+ 18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
8613
+ 20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
8614
+ 21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
8615
+ 22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
8616
+ 24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
8617
+ 32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
8618
+ 33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
8619
+ 33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
8620
+ 35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
8621
+ 37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
8622
+ 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
8623
+ 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
8624
+ };
8625
+ const int kmap_size = 43692;
8626
+ const int nwant = 2;
8627
+ const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
8628
+ uint64_t * kgrid_q2xs;
8629
+ int * kmap_q2xs;
8630
+ uint16_t * kneighbors_q2xs;
8631
+
8632
+ printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
8633
+ uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
8634
+ for (int k = 0; k < grid_size; ++k) {
8635
+ int8_t * pos = (int8_t *)(the_grid + k);
8636
+ for (int i = 0; i < 8; ++i) {
8637
+ int l = (kgrid[k] >> 2*i) & 0x3;
8638
+ pos[i] = 2*l + 1;
8639
+ }
8640
+ }
8641
+ kgrid_q2xs = the_grid;
8642
+ iq2_data[gindex].grid = the_grid;
8643
+ kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
8644
+ iq2_data[gindex].map = kmap_q2xs;
8645
+ for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
8646
+ uint64_t aux64;
8647
+ uint8_t * aux8 = (uint8_t *)&aux64;
8648
+ for (int i = 0; i < grid_size; ++i) {
8649
+ aux64 = kgrid_q2xs[i];
8650
+ uint16_t index = 0;
8651
+ for (int k=0; k<8; ++k) {
8652
+ uint16_t q = (aux8[k] - 1)/2;
8653
+ index |= (q << 2*k);
8654
+ }
8655
+ kmap_q2xs[index] = i;
8656
+ }
8657
+ int8_t pos[8];
8658
+ int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
8659
+ int num_neighbors = 0, num_not_in_map = 0;
8660
+ for (int i = 0; i < kmap_size; ++i) {
8661
+ if (kmap_q2xs[i] >= 0) continue;
8662
+ ++num_not_in_map;
8663
+ for (int k = 0; k < 8; ++k) {
8664
+ int l = (i >> 2*k) & 0x3;
8665
+ pos[k] = 2*l + 1;
8666
+ }
8667
+ for (int j = 0; j < grid_size; ++j) {
8668
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
8669
+ int d2 = 0;
8670
+ for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
8671
+ dist2[2*j+0] = d2;
8672
+ dist2[2*j+1] = j;
8673
+ }
8674
+ qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
8675
+ int n = 0; int d2 = dist2[0];
8676
+ int nhave = 1;
8677
+ for (int j = 0; j < grid_size; ++j) {
8678
+ if (dist2[2*j] > d2) {
8679
+ if (nhave == nwant) break;
8680
+ d2 = dist2[2*j];
8681
+ ++nhave;
8682
+ }
8683
+ ++n;
8684
+ }
8685
+ num_neighbors += n;
8686
+ }
8687
+ printf("%s: %d neighbours in total\n", __func__, num_neighbors);
8688
+ kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
8689
+ iq2_data[gindex].neighbours = kneighbors_q2xs;
8690
+ int counter = 0;
8691
+ for (int i = 0; i < kmap_size; ++i) {
8692
+ if (kmap_q2xs[i] >= 0) continue;
8693
+ for (int k = 0; k < 8; ++k) {
8694
+ int l = (i >> 2*k) & 0x3;
8695
+ pos[k] = 2*l + 1;
8696
+ }
8697
+ for (int j = 0; j < grid_size; ++j) {
8698
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
8699
+ int d2 = 0;
8700
+ for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
8701
+ dist2[2*j+0] = d2;
8702
+ dist2[2*j+1] = j;
8703
+ }
8704
+ qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
8705
+ kmap_q2xs[i] = -(counter + 1);
8706
+ int d2 = dist2[0];
8707
+ uint16_t * start = &kneighbors_q2xs[counter++];
8708
+ int n = 0, nhave = 1;
8709
+ for (int j = 0; j < grid_size; ++j) {
8710
+ if (dist2[2*j] > d2) {
8711
+ if (nhave == nwant) break;
8712
+ d2 = dist2[2*j];
8713
+ ++nhave;
8714
+ }
8715
+ kneighbors_q2xs[counter++] = dist2[2*j+1];
8716
+ ++n;
8717
+ }
8718
+ *start = n;
8719
+ }
8720
+ free(dist2);
8721
+ }
8722
+
8723
+ void ggml_init_iq2_quantization(enum ggml_type type) {
8724
+ if (type == GGML_TYPE_IQ2_XXS) {
8725
+ q2xs_init_impl(256);
8726
+ }
8727
+ else if (type == GGML_TYPE_IQ2_XS) {
8728
+ q2xs_init_impl(512);
8729
+ }
8730
+ else {
8731
+ fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
8732
+ }
8733
+ }
8734
+
8735
+ static void q2xs_deinit_impl(int grid_size) {
8736
+ GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
8737
+ const int gindex = iq2_data_index(grid_size);
8738
+ if (iq2_data[gindex].grid) {
8739
+ free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
8740
+ free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
8741
+ free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
8742
+ }
8743
+ }
8744
+
8745
+ void ggml_deinit_iq2_quantization(enum ggml_type type) {
8746
+ if (type == GGML_TYPE_IQ2_XXS) {
8747
+ q2xs_deinit_impl(256);
8748
+ }
8749
+ else if (type == GGML_TYPE_IQ2_XS) {
8750
+ q2xs_deinit_impl(512);
8751
+ }
8752
+ else {
8753
+ fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
8754
+ }
8755
+ }
8756
+
8757
// Among the candidate grid points listed in neighbours (neighbours[0] is the count, followed by
// that many grid indices), pick the one minimising sum_i weight[i] * (scale*g[i] - xval[i])^2
// over its 8 components. Writes the winner's components, mapped through (g - 1)/2, into L and
// returns its grid index. The first candidate wins ties.
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
    const int count = neighbours[0];
    GGML_ASSERT(count > 0);

    int   best_index = -1;
    float best_err   = FLT_MAX;
    for (int j = 1; j <= count; ++j) {
        const int candidate = neighbours[j];
        const int8_t * point = (const int8_t *)(grid + candidate);

        float err = 0;
        for (int i = 0; i < 8; ++i) {
            const float q    = point[i];
            const float diff = scale*q - xval[i];
            err += weight[i]*diff*diff;
        }

        if (err < best_err) {
            best_err   = err;
            best_index = candidate;
        }
    }
    GGML_ASSERT(best_index >= 0);

    const int8_t * winner = (const int8_t *)(grid + best_index);
    for (int i = 0; i < 8; ++i) {
        L[i] = (winner[i] - 1)/2;
    }
    return best_index;
}
8780
+
8781
+ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
8782
+
8783
+ const int gindex = iq2_data_index(256);
8784
+
8785
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
8786
+ const int * kmap_q2xs = iq2_data[gindex].map;
8787
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
8788
+
8789
+ GGML_ASSERT(quant_weights);
8790
+ GGML_ASSERT(kgrid_q2xs);
8791
+ GGML_ASSERT(kmap_q2xs);
8792
+ GGML_ASSERT(kneighbors_q2xs);
8793
+ GGML_ASSERT(n%QK_K == 0);
8794
+
8795
+ const int kMaxQ = 3;
8796
+
8797
+ const int nbl = n/256;
8798
+
8799
+ block_iq2_xxs * y = vy;
8800
+
8801
+ float scales[QK_K/32];
8802
+ float weight[32];
8803
+ float xval[32];
8804
+ int8_t L[32];
8805
+ int8_t Laux[32];
8806
+ float waux[32];
8807
+ bool is_on_grid[4];
8808
+ bool is_on_grid_aux[4];
8809
+ uint8_t block_signs[4];
8810
+ uint32_t q2[2*(QK_K/32)];
8811
+
8812
+ for (int ibl = 0; ibl < nbl; ++ibl) {
8813
+
8814
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
8815
+ memset(q2, 0, QK_K/4);
8816
+
8817
+ float max_scale = 0;
8818
+
8819
+ const float * xbl = x + QK_K*ibl;
8820
+ float sumx2 = 0;
8821
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
8822
+ float sigma2 = sumx2/QK_K;
8823
+
8824
+ for (int ib = 0; ib < QK_K/32; ++ib) {
8825
+ const float * xb = xbl + 32*ib;
8826
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
8827
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
8828
+ for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
8829
+ for (int k = 0; k < 4; ++k) {
8830
+ int nflip = 0;
8831
+ uint8_t s = 0;
8832
+ for (int i = 0; i < 8; ++i) {
8833
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
8834
+ else {
8835
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
8836
+ }
8837
+ }
8838
+ if (nflip%2) {
8839
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
8840
+ for (int i = 1; i < 8; ++i) {
8841
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
8842
+ if (ax < min) {
8843
+ min = ax; imin = i;
8844
+ }
8845
+ }
8846
+ xval[8*k+imin] = -xval[8*k+imin];
8847
+ s ^= (1 << imin);
8848
+ }
8849
+ block_signs[k] = s & 127;
8850
+ }
8851
+ float max = xval[0];
8852
+ for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
8853
+ if (!max) {
8854
+ scales[ib] = 0;
8855
+ memset(L, 0, 32);
8856
+ continue;
8857
+ }
8858
+ float best = 0;
8859
+ float scale = max/(2*kMaxQ-1);
8860
+ for (int is = -9; is <= 9; ++is) {
8861
+ float id = (2*kMaxQ-1+is*0.1f)/max;
8862
+ float this_scale = 1/id;
8863
+ for (int k = 0; k < 4; ++k) {
8864
+ for (int i = 0; i < 8; ++i) {
8865
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
8866
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
8867
+ }
8868
+ uint16_t u = 0;
8869
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
8870
+ int grid_index = kmap_q2xs[u];
8871
+ is_on_grid_aux[k] = true;
8872
+ if (grid_index < 0) {
8873
+ is_on_grid_aux[k] = false;
8874
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
8875
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
8876
+ }
8877
+ }
8878
+ float sumqx = 0, sumq2 = 0;
8879
+ for (int i = 0; i < 32; ++i) {
8880
+ float w = weight[i];
8881
+ float q = 2*Laux[i] + 1;
8882
+ sumqx += w*xval[i]*q;
8883
+ sumq2 += w*q*q;
8884
+ }
8885
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
8886
+ scale = sumqx/sumq2; best = scale*sumqx;
8887
+ for (int i = 0; i < 32; ++i) L[i] = Laux[i];
8888
+ for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
8889
+ }
8890
+ }
8891
+ int n_not_ongrid = 0;
8892
+ for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
8893
+ if (n_not_ongrid > 0 && scale > 0) {
8894
+ float id = 1/scale;
8895
+ for (int k = 0; k < 4; ++k) {
8896
+ if (is_on_grid[k]) continue;
8897
+ uint16_t u = 0;
8898
+ for (int i = 0; i < 8; ++i) {
8899
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
8900
+ l = MAX(0, MIN(kMaxQ-1, l));
8901
+ u |= (l << 2*i);
8902
+ }
8903
+ int grid_index = kmap_q2xs[u];
8904
+ if (grid_index < 0) {
8905
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
8906
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
8907
+ }
8908
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
8909
+ for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
8910
+ }
8911
+ float sumqx = 0, sumq2 = 0;
8912
+ for (int i = 0; i < 32; ++i) {
8913
+ float w = weight[i];
8914
+ float q = 2*L[i] + 1;
8915
+ sumqx += w*xval[i]*q;
8916
+ sumq2 += w*q*q;
8917
+ }
8918
+ if (sumq2 > 0) scale = sumqx/sumq2;
8919
+ }
8920
+ if (scale < 0) {
8921
+ // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
8922
+ // and correspondingly flip quant signs.
8923
+ scale = -scale;
8924
+ for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
8925
+ }
8926
+ for (int k = 0; k < 4; ++k) {
8927
+ uint16_t u = 0;
8928
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
8929
+ int grid_index = kmap_q2xs[u];
8930
+ if (grid_index < 0) {
8931
+ printf("Oops: found point %u not on grid:", u);
8932
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
8933
+ printf("\n");
8934
+ GGML_ASSERT(false);
8935
+ }
8936
+ q2[2*ib+0] |= (grid_index << 8*k);
8937
+ q2[2*ib+1] |= (block_signs[k] << 7*k);
8938
+ }
8939
+ GGML_ASSERT(scale >= 0);
8940
+ scales[ib] = scale;
8941
+ max_scale = MAX(max_scale, scale);
8942
+ }
8943
+
8944
+ if (!max_scale) {
8945
+ memset(y[ibl].qs, 0, QK_K/4);
8946
+ continue;
8947
+ }
8948
+
8949
+ float d = max_scale/31;
8950
+ y[ibl].d = GGML_FP32_TO_FP16(d);
8951
+ float id = 1/d;
8952
+ float sumqx = 0, sumq2 = 0;
8953
+ for (int ib = 0; ib < QK_K/32; ++ib) {
8954
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
8955
+ l = MAX(0, MIN(15, l));
8956
+ q2[2*ib+1] |= ((uint32_t)l << 28);
8957
+ const float * xb = xbl + 32*ib;
8958
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
8959
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
8960
+ const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
8961
+ const float db = d * (1 + 2*l);
8962
+ uint32_t u = 0;
8963
+ for (int k = 0; k < 4; ++k) {
8964
+ const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
8965
+ const float * xk = xb + 8*k;
8966
+ const float * wk = weight + 8*k;
8967
+ const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
8968
+ float best_mse = 0; int best_index = aux8[k];
8969
+ for (int j = 0; j < 8; ++j) {
8970
+ float diff = db * grid[j] * signs[j] - xk[j];
8971
+ best_mse += wk[j] * diff * diff;
8972
+ }
8973
+ for (int idx = 0; idx < 256; ++idx) {
8974
+ grid = (const uint8_t *)(kgrid_q2xs + idx);
8975
+ float mse = 0;
8976
+ for (int j = 0; j < 8; ++j) {
8977
+ float diff = db * grid[j] * signs[j] - xk[j];
8978
+ mse += wk[j] * diff * diff;
8979
+ }
8980
+ if (mse < best_mse) {
8981
+ best_mse = mse; best_index = idx;
8982
+ }
8983
+ }
8984
+ u |= (best_index << 8*k);
8985
+ grid = (const uint8_t *)(kgrid_q2xs + best_index);
8986
+ //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
8987
+ for (int j = 0; j < 8; ++j) {
8988
+ float q = db * grid[j] * signs[j];
8989
+ sumqx += wk[j] * q * xk[j];
8990
+ sumq2 += wk[j] * q * q;
8991
+ }
8992
+ }
8993
+ q2[2*ib] = u;
8994
+ if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
8995
+ }
8996
+ memcpy(y[ibl].qs, q2, QK_K/4);
8997
+ }
8998
+ }
8999
+
9000
+ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
9001
+
9002
+ const int gindex = iq2_data_index(512);
9003
+
9004
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
9005
+ const int * kmap_q2xs = iq2_data[gindex].map;
9006
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
9007
+
9008
+ GGML_ASSERT(quant_weights);
9009
+ GGML_ASSERT(kmap_q2xs);
9010
+ GGML_ASSERT(kgrid_q2xs);
9011
+ GGML_ASSERT(kneighbors_q2xs);
9012
+ GGML_ASSERT(n%QK_K == 0);
9013
+
9014
+ const int kMaxQ = 3;
9015
+
9016
+ const int nbl = n/256;
9017
+
9018
+ block_iq2_xs * y = vy;
9019
+
9020
+ float scales[QK_K/16];
9021
+ float weight[16];
9022
+ float xval[16];
9023
+ int8_t L[16];
9024
+ int8_t Laux[16];
9025
+ float waux[16];
9026
+ bool is_on_grid[2];
9027
+ bool is_on_grid_aux[2];
9028
+ uint8_t block_signs[2];
9029
+ uint16_t q2[2*(QK_K/16)];
9030
+
9031
+ for (int ibl = 0; ibl < nbl; ++ibl) {
9032
+
9033
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
9034
+ memset(q2, 0, QK_K/4);
9035
+ memset(y[ibl].scales, 0, QK_K/32);
9036
+
9037
+ float max_scale = 0;
9038
+
9039
+ const float * xbl = x + QK_K*ibl;
9040
+ float sumx2 = 0;
9041
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
9042
+ float sigma2 = sumx2/QK_K;
9043
+
9044
+ for (int ib = 0; ib < QK_K/16; ++ib) {
9045
+ const float * xb = xbl + 16*ib;
9046
+ const float * qw = quant_weights + QK_K*ibl + 16*ib;
9047
+ for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
9048
+ for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
9049
+ for (int k = 0; k < 2; ++k) {
9050
+ int nflip = 0;
9051
+ uint8_t s = 0;
9052
+ for (int i = 0; i < 8; ++i) {
9053
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
9054
+ else {
9055
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
9056
+ }
9057
+ }
9058
+ if (nflip%2) {
9059
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
9060
+ for (int i = 1; i < 8; ++i) {
9061
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
9062
+ if (ax < min) {
9063
+ min = ax; imin = i;
9064
+ }
9065
+ }
9066
+ xval[8*k+imin] = -xval[8*k+imin];
9067
+ s ^= (1 << imin);
9068
+ }
9069
+ block_signs[k] = s & 127;
9070
+ }
9071
+ float max = xval[0];
9072
+ for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
9073
+ if (!max) {
9074
+ scales[ib] = 0;
9075
+ memset(L, 0, 16);
9076
+ continue;
9077
+ }
9078
+ float best = 0;
9079
+ float scale = max/(2*kMaxQ-1);
9080
+ is_on_grid[0] = is_on_grid[1] = true;
9081
+ for (int is = -9; is <= 9; ++is) {
9082
+ float id = (2*kMaxQ-1+is*0.1f)/max;
9083
+ float this_scale = 1/id;
9084
+ for (int k = 0; k < 2; ++k) {
9085
+ for (int i = 0; i < 8; ++i) {
9086
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
9087
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
9088
+ }
9089
+ uint16_t u = 0;
9090
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
9091
+ int grid_index = kmap_q2xs[u];
9092
+ is_on_grid_aux[k] = true;
9093
+ if (grid_index < 0) {
9094
+ is_on_grid_aux[k] = false;
9095
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
9096
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
9097
+ }
9098
+ }
9099
+ float sumqx = 0, sumq2 = 0;
9100
+ for (int i = 0; i < 16; ++i) {
9101
+ float w = weight[i];
9102
+ float q = 2*Laux[i] + 1;
9103
+ sumqx += w*xval[i]*q;
9104
+ sumq2 += w*q*q;
9105
+ }
9106
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
9107
+ scale = sumqx/sumq2; best = scale*sumqx;
9108
+ for (int i = 0; i < 16; ++i) L[i] = Laux[i];
9109
+ for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
9110
+ }
9111
+ }
9112
+ int n_not_ongrid = 0;
9113
+ for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
9114
+ if (n_not_ongrid > 0 && scale > 0) {
9115
+ float id = 1/scale;
9116
+ for (int k = 0; k < 2; ++k) {
9117
+ if (is_on_grid[k]) continue;
9118
+ uint16_t u = 0;
9119
+ for (int i = 0; i < 8; ++i) {
9120
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
9121
+ l = MAX(0, MIN(kMaxQ-1, l));
9122
+ u |= (l << 2*i);
9123
+ L[8*k + i] = l;
9124
+ }
9125
+ int grid_index = kmap_q2xs[u];
9126
+ if (grid_index < 0) {
9127
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
9128
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
9129
+ }
9130
+ }
9131
+ float sumqx = 0, sumq2 = 0;
9132
+ for (int i = 0; i < 16; ++i) {
9133
+ float w = weight[i];
9134
+ float q = 2*L[i] + 1;
9135
+ sumqx += w*xval[i]*q;
9136
+ sumq2 += w*q*q;
9137
+ }
9138
+ if (sumq2 > 0) scale = sumqx/sumq2;
9139
+ }
9140
+ if (scale < 0) {
9141
+ scale = -scale;
9142
+ for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
9143
+ }
9144
+ for (int k = 0; k < 2; ++k) {
9145
+ uint16_t u = 0;
9146
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
9147
+ int grid_index = kmap_q2xs[u];
9148
+ if (grid_index < 0) {
9149
+ printf("Oops: found point %u not on grid:", u);
9150
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
9151
+ printf("\n");
9152
+ GGML_ASSERT(false);
9153
+ }
9154
+ q2[2*ib+k] = grid_index | (block_signs[k] << 9);
9155
+ }
9156
+ GGML_ASSERT(scale >= 0);
9157
+ scales[ib] = scale;
9158
+ max_scale = MAX(max_scale, scale);
9159
+ }
9160
+
9161
+ if (!max_scale) {
9162
+ memset(y[ibl].qs, 0, QK_K/4);
9163
+ continue;
9164
+ }
9165
+
9166
+ float d = max_scale/31;
9167
+ y[ibl].d = GGML_FP32_TO_FP16(d);
9168
+ float id = 1/d;
9169
+ for (int ib = 0; ib < QK_K/16; ++ib) {
9170
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
9171
+ l = MAX(0, MIN(15, l));
9172
+ if (ib%2 == 0) y[ibl].scales[ib/2] = l;
9173
+ else y[ibl].scales[ib/2] |= (l << 4);
9174
+ }
9175
+ memcpy(y[ibl].qs, q2, QK_K/4);
9176
+
9177
+ }
9178
+ }
9179
+
9180
+ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
9181
+ (void)hist;
9182
+ GGML_ASSERT(n_per_row%QK_K == 0);
9183
+ int nblock = n_per_row/QK_K;
9184
+ char * qrow = (char *)dst;
9185
+ for (int row = 0; row < nrow; ++row) {
9186
+ quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
9187
+ src += n_per_row;
9188
+ qrow += nblock*sizeof(block_iq2_xxs);
9189
+ }
9190
+ return nrow * nblock * sizeof(block_iq2_xxs);
9191
+ }
9192
+
9193
+ size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
9194
+ (void)hist;
9195
+ GGML_ASSERT(n_per_row%QK_K == 0);
9196
+ int nblock = n_per_row/QK_K;
9197
+ char * qrow = (char *)dst;
9198
+ for (int row = 0; row < nrow; ++row) {
9199
+ quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
9200
+ src += n_per_row;
9201
+ qrow += nblock*sizeof(block_iq2_xs);
9202
+ }
9203
+ return nrow * nblock * sizeof(block_iq2_xs);
9204
+ }
9205
+