llama_cpp 0.12.1 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,8 @@
5
5
  #include <string.h>
6
6
  #include <assert.h>
7
7
  #include <float.h>
8
+ #include <stdlib.h> // for qsort
9
+ #include <stdio.h> // for GGML_ASSERT
8
10
 
9
11
  #ifdef __ARM_NEON
10
12
 
@@ -272,10 +274,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
272
274
 
273
275
  // vaddvq_s16
274
276
  // vpaddq_s16
277
+ // vpaddq_s32
275
278
  // vaddvq_s32
276
279
  // vaddvq_f32
277
280
  // vmaxvq_f32
278
281
  // vcvtnq_s32_f32
282
+ // vzip1_u8
283
+ // vzip2_u8
279
284
 
280
285
  inline static int32_t vaddvq_s16(int16x8_t v) {
281
286
  return
@@ -291,6 +296,12 @@ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
291
296
  return vcombine_s16(a0, b0);
292
297
  }
293
298
 
299
+ inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
300
+ int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
301
+ int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
302
+ return vcombine_s32(a0, b0);
303
+ }
304
+
294
305
  inline static int32_t vaddvq_s32(int32x4_t v) {
295
306
  return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
296
307
  }
@@ -316,6 +327,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
316
327
  return res;
317
328
  }
318
329
 
330
+ inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
331
+ uint8x8_t res;
332
+
333
+ res[0] = a[0]; res[1] = b[0];
334
+ res[2] = a[1]; res[3] = b[1];
335
+ res[4] = a[2]; res[5] = b[2];
336
+ res[6] = a[3]; res[7] = b[3];
337
+
338
+ return res;
339
+ }
340
+
341
+ inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
342
+ uint8x8_t res;
343
+
344
+ res[0] = a[4]; res[1] = b[4];
345
+ res[2] = a[5]; res[3] = b[5];
346
+ res[4] = a[6]; res[5] = b[6];
347
+ res[6] = a[7]; res[7] = b[7];
348
+
349
+ return res;
350
+ }
351
+
319
352
  // vld1q_s16_x2
320
353
  // vld1q_u8_x2
321
354
  // vld1q_u8_x4
@@ -482,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
482
515
  quantize_row_q4_0_reference(x, y, k);
483
516
  }
484
517
 
518
+
485
519
  void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
486
520
  const int qk = QK4_1;
487
521
 
@@ -1211,7 +1245,8 @@ static inline int nearest_int(float fval) {
1211
1245
  return (i & 0x007fffff) - 0x00400000;
1212
1246
  }
1213
1247
 
1214
- static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) {
1248
+ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
1249
+ const float * restrict qw) {
1215
1250
  float max = 0;
1216
1251
  float amax = 0;
1217
1252
  for (int i = 0; i < n; ++i) {
@@ -1237,14 +1272,13 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
1237
1272
  rmse_type = -rmse_type;
1238
1273
  return_early = true;
1239
1274
  }
1240
- int weight_type = rmse_type%2;
1241
1275
  float sumlx = 0;
1242
1276
  float suml2 = 0;
1243
1277
  for (int i = 0; i < n; ++i) {
1244
1278
  int l = nearest_int(iscale * x[i]);
1245
1279
  l = MAX(-nmax, MIN(nmax-1, l));
1246
1280
  L[i] = l + nmax;
1247
- float w = weight_type == 1 ? x[i] * x[i] : 1;
1281
+ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
1248
1282
  sumlx += w*x[i]*l;
1249
1283
  suml2 += w*l*l;
1250
1284
  }
@@ -1260,7 +1294,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
1260
1294
  for (int i = 0; i < n; ++i) {
1261
1295
  int l = nearest_int(iscale * x[i]);
1262
1296
  l = MAX(-nmax, MIN(nmax-1, l));
1263
- float w = weight_type == 1 ? x[i] * x[i] : 1;
1297
+ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
1264
1298
  sumlx += w*x[i]*l;
1265
1299
  suml2 += w*l*l;
1266
1300
  }
@@ -1608,6 +1642,241 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n
1608
1642
  return (n/QK_K*sizeof(block_q2_K));
1609
1643
  }
1610
1644
 
1645
// Quantize n values x[] to integer levels 0..nmax under the model
//     x[i] ~= scale * L[i] - *the_min
// and return `scale`.
//
//   weights        per-value error weights; when NULL, x[i]*x[i] is used instead
//   L              output levels (n entries)
//   the_min        receives the negated offset (always >= 0, as the offset is clamped to <= 0)
//   Laux           scratch buffer of n entries used while trying candidate scales
//   rmin, rdelta,
//   nstep          candidate inverse scales are (rmin + rdelta*s + nmax)/(max - min)
//                  for s = 0..nstep; nstep < 1 disables the search
//   use_mad        true: weighted absolute error, false: weighted squared error
//
// For each candidate the levels are recomputed and scale/offset are refit by
// weighted least squares; a candidate is kept only if it lowers the error.
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
        float rmin, float rdelta, int nstep, bool use_mad) {
    float lo    = x[0];
    float hi    = x[0];
    float sum_w = weights ? weights[0] : x[0]*x[0];
    float sum_x = sum_w * x[0];
    for (int k = 1; k < n; ++k) {
        if (x[k] < lo) { lo = x[k]; }
        if (x[k] > hi) { hi = x[k]; }
        const float wk = weights ? weights[k] : x[k]*x[k];
        sum_w += wk;
        sum_x += wk * x[k];
    }
    // The offset is never allowed to be positive.
    if (lo > 0) {
        lo = 0;
    }
    // Degenerate range: everything maps to level 0.
    if (hi <= lo) {
        for (int k = 0; k < n; ++k) { L[k] = 0; }
        *the_min = -lo;
        return 0.f;
    }

    // Baseline: plain min/max quantization and its weighted error.
    float inv_scale  = nmax/(hi - lo);
    float best_scale = 1/inv_scale;
    float best_err   = 0;
    for (int k = 0; k < n; ++k) {
        const int q = nearest_int(inv_scale*(x[k] - lo));
        L[k] = MAX(0, MIN(nmax, q));
        float err = best_scale * L[k] + lo - x[k];
        if (use_mad) {
            err = fabsf(err);
        } else {
            err = err*err;
        }
        const float wk = weights ? weights[k] : x[k]*x[k];
        best_err += wk * err;
    }
    if (nstep < 1) {
        *the_min = -lo;
        return best_scale;
    }

    for (int step = 0; step <= nstep; ++step) {
        // NOTE: `lo` may have been replaced by an earlier accepted candidate,
        // so the range (hi - lo) is intentionally re-read on every step.
        inv_scale = (rmin + rdelta*step + nmax)/(hi - lo);
        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
        for (int k = 0; k < n; ++k) {
            int q = nearest_int(inv_scale*(x[k] - lo));
            q = MAX(0, MIN(nmax, q));
            Laux[k] = q;
            const float wk = weights ? weights[k] : x[k]*x[k];
            sum_l  += wk*q;
            sum_l2 += wk*q*q;
            sum_xl += wk*q*x[k];
        }
        // Determinant of the weighted least-squares normal equations.
        const float det = sum_w * sum_l2 - sum_l * sum_l;
        if (!(det > 0)) {
            continue;
        }
        float cand_scale = (sum_w * sum_xl - sum_x * sum_l)/det;
        float cand_min   = (sum_l2 * sum_x - sum_l * sum_xl)/det;
        if (cand_min > 0) {
            // Positive offset is not allowed: refit the scale with offset 0.
            cand_min   = 0;
            cand_scale = sum_xl / sum_l2;
        }
        float cand_err = 0;
        for (int k = 0; k < n; ++k) {
            float err = cand_scale * Laux[k] + cand_min - x[k];
            if (use_mad) {
                err = fabsf(err);
            } else {
                err = err*err;
            }
            const float wk = weights ? weights[k] : x[k]*x[k];
            cand_err += wk * err;
        }
        if (cand_err < best_err) {
            for (int k = 0; k < n; ++k) {
                L[k] = Laux[k];
            }
            best_err   = cand_err;
            best_scale = cand_scale;
            lo         = cand_min;
        }
    }
    *the_min = -lo;
    return best_scale;
}
1722
+
1723
// Quantize n values x[] to levels L[i] <= nmax with a single scale and no
// offset (x[i] ~= scale * L[i]); returns the scale, or 0 when max(x) is 0.
// quant_weights[i] weights the squared error of element i and must not be NULL.
// NOTE(review): levels are stored as uint8_t with no lower clamp, so callers
// presumably pass non-negative x — confirm.
//
// Steps: (1) start from nmax/max, (2) try 8 nearby inverse scales and keep the
// one with the lowest weighted squared error, (3) up to 5 passes of per-element
// level refinement that accept a change only if it improves sumlx^2/suml2.
static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
    float xmax = 0;
    for (int k = 0; k < n; ++k) {
        xmax = MAX(xmax, x[k]);
    }
    if (xmax == 0) { // all zero
        for (int k = 0; k < n; ++k) { L[k] = 0; }
        return 0.f;
    }

    float best_iscale = nmax / xmax;
    for (int k = 0; k < n; ++k) {
        L[k] = nearest_int(best_iscale * x[k]);
    }

    // Weighted squared error of the initial guess.
    const float scale0 = 1/best_iscale;
    float best_mse = 0;
    for (int k = 0; k < n; ++k) {
        const float err = x[k] - scale0*L[k];
        const float wk  = quant_weights[k];
        best_mse += wk*err*err;
    }

    // Scan inverse scales around the initial guess (step 0 is the guess itself).
    for (int step = -4; step <= 4; ++step) {
        if (step == 0) {
            continue;
        }
        const float cand_iscale = (0.1f*step + nmax)/xmax;
        const float cand_scale  = 1/cand_iscale;
        float cand_mse = 0;
        for (int k = 0; k < n; ++k) {
            int q = nearest_int(cand_iscale*x[k]);
            q = MIN(nmax, q);
            const float err = x[k] - cand_scale*q;
            const float wk  = quant_weights[k];
            cand_mse += wk*err*err;
        }
        if (cand_mse < best_mse) {
            best_mse    = cand_mse;
            best_iscale = cand_iscale;
        }
    }

    // Re-quantize with the winning inverse scale and accumulate the sums
    // needed for the least-squares scale sumlx/suml2.
    float sumlx = 0;
    float suml2 = 0;
    for (int k = 0; k < n; ++k) {
        int q = nearest_int(best_iscale * x[k]);
        q = MIN(nmax, q);
        L[k] = q;
        const float wk = quant_weights[k];
        sumlx += wk*x[k]*q;
        suml2 += wk*q*q;
    }

    for (int pass = 0; pass < 5; ++pass) {
        int n_changed = 0;
        for (int k = 0; k < n; ++k) {
            const float wk = quant_weights[k];
            // Sums with element k removed.
            float slx = sumlx - wk*x[k]*L[k];
            float sl2 = suml2 - wk*L[k]*L[k];
            if (!(slx > 0 && sl2 > 0)) {
                continue;
            }
            int new_l = nearest_int(x[k] * sl2 / slx);
            new_l = MIN(nmax, new_l);
            if (new_l == L[k]) {
                continue;
            }
            slx += wk*x[k]*new_l;
            sl2 += wk*new_l*new_l;
            if (slx*slx*suml2 > sumlx*sumlx*sl2) {
                L[k]  = new_l;
                sumlx = slx;
                suml2 = sl2;
                ++n_changed;
            }
        }
        if (n_changed == 0) {
            break;
        }
    }
    return sumlx / suml2;
}
1795
+
1796
+ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
1797
+ GGML_ASSERT(quant_weights);
1798
+ assert(k % QK_K == 0);
1799
+ const int nb = k / QK_K;
1800
+ const bool requantize = true;
1801
+
1802
+ uint8_t L[QK_K];
1803
+ uint8_t Laux[16];
1804
+ float mins[QK_K/16];
1805
+ float scales[QK_K/16];
1806
+ float sw[QK_K/16];
1807
+ float weight[QK_K/16];
1808
+ uint8_t Ls[QK_K/16], Lm[QK_K/16];
1809
+
1810
+ for (int i = 0; i < nb; i++) {
1811
+ memset(sw, 0, QK_K/16*sizeof(float));
1812
+ float sumx2 = 0;
1813
+ for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
1814
+ float sigma2 = sumx2/QK_K;
1815
+ for (int j = 0; j < QK_K/16; ++j) {
1816
+ const float * restrict qw = quant_weights + QK_K * i + 16*j;
1817
+ for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
1818
+ for (int l = 0; l < 16; ++l) sw[j] += weight[l];
1819
+ scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
1820
+ }
1821
+
1822
+ float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
1823
+ float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
1824
+ y[i].d = GGML_FP32_TO_FP16(dm);
1825
+ y[i].dmin = GGML_FP32_TO_FP16(mm);
1826
+ dm = GGML_FP16_TO_FP32(y[i].d);
1827
+ mm = GGML_FP16_TO_FP32(y[i].dmin);
1828
+
1829
+ for (int j = 0; j < QK_K/16; ++j) {
1830
+ y[i].scales[j] = Ls[j] | (Lm[j] << 4);
1831
+ }
1832
+
1833
+ if (requantize) {
1834
+ for (int j = 0; j < QK_K/16; ++j) {
1835
+ const float d = dm * (y[i].scales[j] & 0xF);
1836
+ if (!d) continue;
1837
+ const float m = mm * (y[i].scales[j] >> 4);
1838
+ for (int ii = 0; ii < 16; ++ii) {
1839
+ int l = nearest_int((x[16*j + ii] + m)/d);
1840
+ l = MAX(0, MIN(3, l));
1841
+ L[16*j + ii] = l;
1842
+ }
1843
+ }
1844
+ }
1845
+
1846
+ #if QK_K == 256
1847
+ for (int j = 0; j < QK_K; j += 128) {
1848
+ for (int l = 0; l < 32; ++l) {
1849
+ y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
1850
+ }
1851
+ }
1852
+ #else
1853
+ for (int l = 0; l < 16; ++l) {
1854
+ y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
1855
+ }
1856
+ #endif
1857
+
1858
+ x += QK_K;
1859
+
1860
+ }
1861
+ }
1862
+
1863
+ size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
1864
+ (void)hist;
1865
+ int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1866
+ if (!quant_weights) {
1867
+ quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
1868
+ }
1869
+ else {
1870
+ char * qrow = (char *)dst;
1871
+ for (int row = 0; row < nrow; ++row) {
1872
+ quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
1873
+ src += n_per_row;
1874
+ qrow += row_size;
1875
+ }
1876
+ }
1877
+ return nrow * row_size;
1878
+ }
1879
+
1611
1880
  //========================= 3-bit (de)-quantization
1612
1881
 
1613
1882
  void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
@@ -1821,6 +2090,112 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n
1821
2090
  return (n/QK_K*sizeof(block_q3_K));
1822
2091
  }
1823
2092
 
2093
+ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2094
+ #if QK_K != 256
2095
+ (void)quant_weights;
2096
+ quantize_row_q3_K_reference(x, y, n_per_row);
2097
+ #else
2098
+ assert(n_per_row % QK_K == 0);
2099
+ const int nb = n_per_row / QK_K;
2100
+
2101
+ int8_t L[QK_K];
2102
+ float scales[QK_K / 16];
2103
+ float weight[16];
2104
+ float sw[QK_K / 16];
2105
+ int8_t Ls[QK_K / 16];
2106
+
2107
+ for (int i = 0; i < nb; i++) {
2108
+
2109
+ float sumx2 = 0;
2110
+ for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
2111
+ float sigma2 = 2*sumx2/QK_K;
2112
+
2113
+ for (int j = 0; j < QK_K/16; ++j) {
2114
+ if (quant_weights) {
2115
+ const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
2116
+ for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
2117
+ } else {
2118
+ for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
2119
+ }
2120
+ float sumw = 0;
2121
+ for (int l = 0; l < 16; ++l) sumw += weight[l];
2122
+ sw[j] = sumw;
2123
+
2124
+ scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
2125
+
2126
+ }
2127
+
2128
+ memset(y[i].scales, 0, 12);
2129
+
2130
+ float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
2131
+ for (int j = 0; j < QK_K/16; ++j) {
2132
+ int l = Ls[j];
2133
+ if (j < 8) {
2134
+ y[i].scales[j] = l & 0xF;
2135
+ } else {
2136
+ y[i].scales[j-8] |= ((l & 0xF) << 4);
2137
+ }
2138
+ l >>= 4;
2139
+ y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
2140
+ }
2141
+ y[i].d = GGML_FP32_TO_FP16(d_block);
2142
+
2143
+ int8_t sc;
2144
+ for (int j = 0; j < QK_K/16; ++j) {
2145
+ sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
2146
+ sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
2147
+ float d = GGML_FP16_TO_FP32(y[i].d) * sc;
2148
+ if (!d) {
2149
+ continue;
2150
+ }
2151
+ for (int ii = 0; ii < 16; ++ii) {
2152
+ int l = nearest_int(x[16*j + ii]/d);
2153
+ l = MAX(-4, MIN(3, l));
2154
+ L[16*j + ii] = l + 4;
2155
+ }
2156
+ }
2157
+
2158
+ memset(y[i].hmask, 0, QK_K/8);
2159
+ // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
2160
+ int m = 0;
2161
+ uint8_t hm = 1;
2162
+ for (int j = 0; j < QK_K; ++j) {
2163
+ if (L[j] > 3) {
2164
+ y[i].hmask[m] |= hm;
2165
+ L[j] -= 4;
2166
+ }
2167
+ if (++m == QK_K/8) {
2168
+ m = 0; hm <<= 1;
2169
+ }
2170
+ }
2171
+ for (int j = 0; j < QK_K; j += 128) {
2172
+ for (int l = 0; l < 32; ++l) {
2173
+ y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
2174
+ }
2175
+ }
2176
+
2177
+ x += QK_K;
2178
+ }
2179
+ #endif
2180
+ }
2181
+
2182
+ size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2183
+ (void)hist;
2184
+ int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2185
+ if (!quant_weights) {
2186
+ quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
2187
+ }
2188
+ else {
2189
+ char * qrow = (char *)dst;
2190
+ for (int row = 0; row < nrow; ++row) {
2191
+ quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
2192
+ src += n_per_row;
2193
+ qrow += row_size;
2194
+ }
2195
+ }
2196
+ return nrow * row_size;
2197
+ }
2198
+
1824
2199
  // ====================== 4-bit (de)-quantization
1825
2200
 
1826
2201
  void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
@@ -1986,36 +2361,38 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n
1986
2361
  return (n/QK_K*sizeof(block_q4_K));
1987
2362
  }
1988
2363
 
1989
- // ====================== 5-bit (de)-quantization
1990
-
1991
- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
1992
- assert(k % QK_K == 0);
1993
- const int nb = k / QK_K;
2364
+ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2365
+ #if QK_K != 256
2366
+ (void)quant_weights;
2367
+ quantize_row_q4_K_reference(x, y, n_per_row);
2368
+ #else
2369
+ assert(n_per_row % QK_K == 0);
2370
+ const int nb = n_per_row / QK_K;
1994
2371
 
1995
- #if QK_K == 256
1996
2372
  uint8_t L[QK_K];
2373
+ uint8_t Laux[32];
2374
+ float weights[32];
1997
2375
  float mins[QK_K/32];
1998
2376
  float scales[QK_K/32];
1999
- float weights[32];
2000
- uint8_t Laux[32];
2001
- #else
2002
- int8_t L[QK_K];
2003
- float scales[QK_K/16];
2004
- #endif
2005
2377
 
2006
2378
  for (int i = 0; i < nb; i++) {
2007
2379
 
2008
- #if QK_K == 256
2380
+ float sum_x2 = 0;
2381
+ for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2382
+ float sigma2 = sum_x2/QK_K;
2383
+ float av_x = sqrtf(sigma2);
2009
2384
 
2010
2385
  float max_scale = 0; // as we are deducting the min, scales are always positive
2011
2386
  float max_min = 0;
2012
2387
  for (int j = 0; j < QK_K/32; ++j) {
2013
- //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
2014
- float sum_x2 = 0;
2015
- for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
2016
- float av_x = sqrtf(sum_x2/32);
2017
- for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2018
- scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
2388
+ if (quant_weights) {
2389
+ const float * qw = quant_weights + QK_K*i + 32*j;
2390
+ for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
2391
+ } else {
2392
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2393
+ }
2394
+ scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2395
+ //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
2019
2396
  float scale = scales[j];
2020
2397
  if (scale > max_scale) {
2021
2398
  max_scale = scale;
@@ -2053,18 +2430,118 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
2053
2430
  const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
2054
2431
  for (int ii = 0; ii < 32; ++ii) {
2055
2432
  int l = nearest_int((x[32*j + ii] + dm)/d);
2056
- l = MAX(0, MIN(31, l));
2433
+ l = MAX(0, MIN(15, l));
2057
2434
  L[32*j + ii] = l;
2058
2435
  }
2059
2436
  }
2437
+ uint8_t * q = y[i].qs;
2438
+ for (int j = 0; j < QK_K; j += 64) {
2439
+ for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
2440
+ q += 32;
2441
+ }
2060
2442
 
2061
- uint8_t * restrict qh = y[i].qh;
2062
- uint8_t * restrict ql = y[i].qs;
2063
- memset(qh, 0, QK_K/8);
2443
+ x += QK_K;
2064
2444
 
2065
- uint8_t m1 = 1, m2 = 2;
2066
- for (int n = 0; n < QK_K; n += 64) {
2067
- for (int j = 0; j < 32; ++j) {
2445
+ }
2446
+ #endif
2447
+ }
2448
+
2449
+ size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2450
+ (void)hist;
2451
+ int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2452
+ if (!quant_weights) {
2453
+ quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
2454
+ }
2455
+ else {
2456
+ char * qrow = (char *)dst;
2457
+ for (int row = 0; row < nrow; ++row) {
2458
+ quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
2459
+ src += n_per_row;
2460
+ qrow += row_size;
2461
+ }
2462
+ }
2463
+ return nrow * row_size;
2464
+ }
2465
+
2466
+ // ====================== 5-bit (de)-quantization
2467
+
2468
+ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
2469
+ assert(k % QK_K == 0);
2470
+ const int nb = k / QK_K;
2471
+
2472
+ #if QK_K == 256
2473
+ uint8_t L[QK_K];
2474
+ float mins[QK_K/32];
2475
+ float scales[QK_K/32];
2476
+ float weights[32];
2477
+ uint8_t Laux[32];
2478
+ #else
2479
+ int8_t L[QK_K];
2480
+ float scales[QK_K/16];
2481
+ #endif
2482
+
2483
+ for (int i = 0; i < nb; i++) {
2484
+
2485
+ #if QK_K == 256
2486
+
2487
+ float max_scale = 0; // as we are deducting the min, scales are always positive
2488
+ float max_min = 0;
2489
+ for (int j = 0; j < QK_K/32; ++j) {
2490
+ //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
2491
+ float sum_x2 = 0;
2492
+ for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
2493
+ float av_x = sqrtf(sum_x2/32);
2494
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2495
+ scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
2496
+ float scale = scales[j];
2497
+ if (scale > max_scale) {
2498
+ max_scale = scale;
2499
+ }
2500
+ float min = mins[j];
2501
+ if (min > max_min) {
2502
+ max_min = min;
2503
+ }
2504
+ }
2505
+
2506
+ float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2507
+ float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2508
+ for (int j = 0; j < QK_K/32; ++j) {
2509
+ uint8_t ls = nearest_int(inv_scale*scales[j]);
2510
+ uint8_t lm = nearest_int(inv_min*mins[j]);
2511
+ ls = MIN(63, ls);
2512
+ lm = MIN(63, lm);
2513
+ if (j < 4) {
2514
+ y[i].scales[j] = ls;
2515
+ y[i].scales[j+4] = lm;
2516
+ } else {
2517
+ y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
2518
+ y[i].scales[j-4] |= ((ls >> 4) << 6);
2519
+ y[i].scales[j-0] |= ((lm >> 4) << 6);
2520
+ }
2521
+ }
2522
+ y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2523
+ y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2524
+
2525
+ uint8_t sc, m;
2526
+ for (int j = 0; j < QK_K/32; ++j) {
2527
+ get_scale_min_k4(j, y[i].scales, &sc, &m);
2528
+ const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
2529
+ if (!d) continue;
2530
+ const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
2531
+ for (int ii = 0; ii < 32; ++ii) {
2532
+ int l = nearest_int((x[32*j + ii] + dm)/d);
2533
+ l = MAX(0, MIN(31, l));
2534
+ L[32*j + ii] = l;
2535
+ }
2536
+ }
2537
+
2538
+ uint8_t * restrict qh = y[i].qh;
2539
+ uint8_t * restrict ql = y[i].qs;
2540
+ memset(qh, 0, QK_K/8);
2541
+
2542
+ uint8_t m1 = 1, m2 = 2;
2543
+ for (int n = 0; n < QK_K; n += 64) {
2544
+ for (int j = 0; j < 32; ++j) {
2068
2545
  int l1 = L[n + j];
2069
2546
  if (l1 > 15) {
2070
2547
  l1 -= 16; qh[j] |= m1;
@@ -2081,7 +2558,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
2081
2558
  #else
2082
2559
  float max_scale = 0, amax = 0;
2083
2560
  for (int j = 0; j < QK_K/16; ++j) {
2084
- scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1);
2561
+ scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL);
2085
2562
  float abs_scale = fabsf(scales[j]);
2086
2563
  if (abs_scale > amax) {
2087
2564
  amax = abs_scale;
@@ -2192,6 +2669,123 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n
2192
2669
  return (n/QK_K*sizeof(block_q5_K));
2193
2670
  }
2194
2671
 
2672
+ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2673
+ #if QK_K != 256
2674
+ (void)quant_weights;
2675
+ quantize_row_q5_K_reference(x, y, n_per_row);
2676
+ #else
2677
+ assert(n_per_row % QK_K == 0);
2678
+ const int nb = n_per_row / QK_K;
2679
+
2680
+ uint8_t L[QK_K];
2681
+ float mins[QK_K/32];
2682
+ float scales[QK_K/32];
2683
+ float weights[32];
2684
+ uint8_t Laux[32];
2685
+
2686
+ for (int i = 0; i < nb; i++) {
2687
+
2688
+ float sum_x2 = 0;
2689
+ for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2690
+ float sigma2 = sum_x2/QK_K;
2691
+ float av_x = sqrtf(sigma2);
2692
+
2693
+ float max_scale = 0; // as we are deducting the min, scales are always positive
2694
+ float max_min = 0;
2695
+ for (int j = 0; j < QK_K/32; ++j) {
2696
+ if (quant_weights) {
2697
+ const float * qw = quant_weights + QK_K*i + 32*j;
2698
+ for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
2699
+ } else {
2700
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2701
+ }
2702
+ scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2703
+ float scale = scales[j];
2704
+ if (scale > max_scale) {
2705
+ max_scale = scale;
2706
+ }
2707
+ float min = mins[j];
2708
+ if (min > max_min) {
2709
+ max_min = min;
2710
+ }
2711
+ }
2712
+
2713
+ float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2714
+ float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2715
+ for (int j = 0; j < QK_K/32; ++j) {
2716
+ uint8_t ls = nearest_int(inv_scale*scales[j]);
2717
+ uint8_t lm = nearest_int(inv_min*mins[j]);
2718
+ ls = MIN(63, ls);
2719
+ lm = MIN(63, lm);
2720
+ if (j < 4) {
2721
+ y[i].scales[j] = ls;
2722
+ y[i].scales[j+4] = lm;
2723
+ } else {
2724
+ y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
2725
+ y[i].scales[j-4] |= ((ls >> 4) << 6);
2726
+ y[i].scales[j-0] |= ((lm >> 4) << 6);
2727
+ }
2728
+ }
2729
+ y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2730
+ y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2731
+
2732
+ uint8_t sc, m;
2733
+ for (int j = 0; j < QK_K/32; ++j) {
2734
+ get_scale_min_k4(j, y[i].scales, &sc, &m);
2735
+ const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
2736
+ if (!d) continue;
2737
+ const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
2738
+ for (int ii = 0; ii < 32; ++ii) {
2739
+ int l = nearest_int((x[32*j + ii] + dm)/d);
2740
+ l = MAX(0, MIN(31, l));
2741
+ L[32*j + ii] = l;
2742
+ }
2743
+ }
2744
+
2745
+ uint8_t * restrict qh = y[i].qh;
2746
+ uint8_t * restrict ql = y[i].qs;
2747
+ memset(qh, 0, QK_K/8);
2748
+
2749
+ uint8_t m1 = 1, m2 = 2;
2750
+ for (int n = 0; n < QK_K; n += 64) {
2751
+ for (int j = 0; j < 32; ++j) {
2752
+ int l1 = L[n + j];
2753
+ if (l1 > 15) {
2754
+ l1 -= 16; qh[j] |= m1;
2755
+ }
2756
+ int l2 = L[n + j + 32];
2757
+ if (l2 > 15) {
2758
+ l2 -= 16; qh[j] |= m2;
2759
+ }
2760
+ ql[j] = l1 | (l2 << 4);
2761
+ }
2762
+ m1 <<= 2; m2 <<= 2;
2763
+ ql += 32;
2764
+ }
2765
+
2766
+ x += QK_K;
2767
+
2768
+ }
2769
+ #endif
2770
+ }
2771
+
2772
+ size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2773
+ (void)hist;
2774
+ int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2775
+ if (!quant_weights) {
2776
+ quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
2777
+ }
2778
+ else {
2779
+ char * qrow = (char *)dst;
2780
+ for (int row = 0; row < nrow; ++row) {
2781
+ quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
2782
+ src += n_per_row;
2783
+ qrow += row_size;
2784
+ }
2785
+ }
2786
+ return nrow * row_size;
2787
+ }
2788
+
2195
2789
  // ====================== 6-bit (de)-quantization
2196
2790
 
2197
2791
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
@@ -2208,7 +2802,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2208
2802
 
2209
2803
  for (int ib = 0; ib < QK_K/16; ++ib) {
2210
2804
 
2211
- const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1);
2805
+ const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
2212
2806
  scales[ib] = scale;
2213
2807
 
2214
2808
  const float abs_scale = fabsf(scale);
@@ -2276,68 +2870,365 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2276
2870
  }
2277
2871
  }
2278
2872
 
2279
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
2280
- assert(k % QK_K == 0);
2281
- const int nb = k / QK_K;
2873
+ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
2874
+ assert(k % QK_K == 0);
2875
+ const int nb = k / QK_K;
2876
+
2877
+ for (int i = 0; i < nb; i++) {
2878
+
2879
+ const float d = GGML_FP16_TO_FP32(x[i].d);
2880
+
2881
+ const uint8_t * restrict ql = x[i].ql;
2882
+ const uint8_t * restrict qh = x[i].qh;
2883
+ const int8_t * restrict sc = x[i].scales;
2884
+
2885
+ #if QK_K == 256
2886
+ for (int n = 0; n < QK_K; n += 128) {
2887
+ for (int l = 0; l < 32; ++l) {
2888
+ int is = l/16;
2889
+ const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2890
+ const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2891
+ const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2892
+ const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2893
+ y[l + 0] = d * sc[is + 0] * q1;
2894
+ y[l + 32] = d * sc[is + 2] * q2;
2895
+ y[l + 64] = d * sc[is + 4] * q3;
2896
+ y[l + 96] = d * sc[is + 6] * q4;
2897
+ }
2898
+ y += 128;
2899
+ ql += 64;
2900
+ qh += 32;
2901
+ sc += 8;
2902
+ }
2903
+ #else
2904
+ for (int l = 0; l < 16; ++l) {
2905
+ const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2906
+ const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2907
+ const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2908
+ const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2909
+ y[l+ 0] = d * sc[0] * q1;
2910
+ y[l+16] = d * sc[1] * q2;
2911
+ y[l+32] = d * sc[2] * q3;
2912
+ y[l+48] = d * sc[3] * q4;
2913
+ }
2914
+ y += 64;
2915
+ #endif
2916
+
2917
+ }
2918
+ }
2919
+
2920
+ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
2921
+ assert(k % QK_K == 0);
2922
+ block_q6_K * restrict y = vy;
2923
+ quantize_row_q6_K_reference(x, y, k);
2924
+ }
2925
+
2926
+ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
2927
+ assert(k % QK_K == 0);
2928
+ (void)hist; // TODO: collect histograms
2929
+
2930
+ for (int j = 0; j < n; j += k) {
2931
+ block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
2932
+ quantize_row_q6_K_reference(src + j, y, k);
2933
+ }
2934
+ return (n/QK_K*sizeof(block_q6_K));
2935
+ }
2936
+
2937
+ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
2938
+ #if QK_K != 256
2939
+ (void)quant_weights;
2940
+ quantize_row_q6_K_reference(x, y, n_per_row);
2941
+ #else
2942
+ assert(n_per_row % QK_K == 0);
2943
+ const int nb = n_per_row / QK_K;
2944
+
2945
+ int8_t L[QK_K];
2946
+ float scales[QK_K/16];
2947
+ //float weights[16];
2948
+
2949
+ for (int i = 0; i < nb; i++) {
2950
+
2951
+ //float sum_x2 = 0;
2952
+ //for (int j = 0; j < QK_K; ++j) sum_x2 += x[j]*x[j];
2953
+ //float sigma2 = sum_x2/QK_K;
2954
+
2955
+ float max_scale = 0;
2956
+ float max_abs_scale = 0;
2957
+
2958
+ for (int ib = 0; ib < QK_K/16; ++ib) {
2959
+
2960
+ float scale;
2961
+ if (quant_weights) {
2962
+ const float * qw = quant_weights + QK_K*i + 16*ib;
2963
+ //for (int j = 0; j < 16; ++j) weights[j] = qw[j] * sqrtf(sigma2 + x[16*ib + j]*x[16*ib + j]);
2964
+ //scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, weights);
2965
+ scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw);
2966
+ } else {
2967
+ scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
2968
+ }
2969
+ scales[ib] = scale;
2970
+
2971
+ const float abs_scale = fabsf(scale);
2972
+ if (abs_scale > max_abs_scale) {
2973
+ max_abs_scale = abs_scale;
2974
+ max_scale = scale;
2975
+ }
2976
+
2977
+ }
2978
+
2979
+ if (!max_abs_scale) {
2980
+ memset(&y[i], 0, sizeof(block_q6_K));
2981
+ y[i].d = GGML_FP32_TO_FP16(0.f);
2982
+ x += QK_K;
2983
+ continue;
2984
+ }
2985
+
2986
+ float iscale = -128.f/max_scale;
2987
+ y[i].d = GGML_FP32_TO_FP16(1/iscale);
2988
+ for (int ib = 0; ib < QK_K/16; ++ib) {
2989
+ y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
2990
+ }
2991
+
2992
+ for (int j = 0; j < QK_K/16; ++j) {
2993
+ float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
2994
+ if (!d) {
2995
+ continue;
2996
+ }
2997
+ for (int ii = 0; ii < 16; ++ii) {
2998
+ int l = nearest_int(x[16*j + ii]/d);
2999
+ l = MAX(-32, MIN(31, l));
3000
+ L[16*j + ii] = l + 32;
3001
+ }
3002
+ }
3003
+
3004
+ uint8_t * restrict ql = y[i].ql;
3005
+ uint8_t * restrict qh = y[i].qh;
3006
+ for (int j = 0; j < QK_K; j += 128) {
3007
+ for (int l = 0; l < 32; ++l) {
3008
+ const uint8_t q1 = L[j + l + 0] & 0xF;
3009
+ const uint8_t q2 = L[j + l + 32] & 0xF;
3010
+ const uint8_t q3 = L[j + l + 64] & 0xF;
3011
+ const uint8_t q4 = L[j + l + 96] & 0xF;
3012
+ ql[l+ 0] = q1 | (q3 << 4);
3013
+ ql[l+32] = q2 | (q4 << 4);
3014
+ qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
3015
+ }
3016
+ ql += 64;
3017
+ qh += 32;
3018
+ }
3019
+
3020
+ x += QK_K;
3021
+
3022
+ }
3023
+ #endif
3024
+ }
3025
+
3026
+ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3027
+ (void)hist;
3028
+ int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3029
+ if (!quant_weights) {
3030
+ quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
3031
+ }
3032
+ else {
3033
+ char * qrow = (char *)dst;
3034
+ for (int row = 0; row < nrow; ++row) {
3035
+ quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
3036
+ src += n_per_row;
3037
+ qrow += row_size;
3038
+ }
3039
+ }
3040
+ return nrow * row_size;
3041
+ }
3042
+
3043
+ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
3044
+ static_assert(QK4_0 == 32, "QK4_0 must be 32");
3045
+
3046
+ if (!quant_weights) {
3047
+ quantize_row_q4_0_reference(x, y, n_per_row);
3048
+ return;
3049
+ }
3050
+
3051
+ float weight[QK4_0];
3052
+ int8_t L[QK4_0];
3053
+
3054
+ float sum_x2 = 0;
3055
+ for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3056
+ float sigma2 = sum_x2/n_per_row;
3057
+
3058
+ const int nb = n_per_row/QK4_0;
3059
+ for (int ib = 0; ib < nb; ++ib) {
3060
+ const float * xb = x + QK4_0 * ib;
3061
+ const float * qw = quant_weights + QK4_0 * ib;
3062
+ for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
3063
+ float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
3064
+ y[ib].d = GGML_FP32_TO_FP16(d);
3065
+ for (int j = 0; j < 16; ++j) {
3066
+ y[ib].qs[j] = L[j] | (L[j+16] << 4);
3067
+ }
3068
+ }
3069
+ }
3070
+
3071
+ size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3072
+ if (!quant_weights) {
3073
+ return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
3074
+ }
3075
+ int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3076
+ char * qrow = (char *)dst;
3077
+ for (int row = 0; row < nrow; ++row) {
3078
+ quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
3079
+ src += n_per_row;
3080
+ qrow += row_size;
3081
+ }
3082
+ return nrow * row_size;
3083
+ }
3084
+
3085
+ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
3086
+ static_assert(QK4_1 == 32, "QK4_1 must be 32");
2282
3087
 
2283
- for (int i = 0; i < nb; i++) {
3088
+ if (!quant_weights) {
3089
+ quantize_row_q4_1_reference(x, y, n_per_row);
3090
+ return;
3091
+ }
2284
3092
 
2285
- const float d = GGML_FP16_TO_FP32(x[i].d);
3093
+ float weight[QK4_1];
3094
+ uint8_t L[QK4_1], Laux[QK4_1];
2286
3095
 
2287
- const uint8_t * restrict ql = x[i].ql;
2288
- const uint8_t * restrict qh = x[i].qh;
2289
- const int8_t * restrict sc = x[i].scales;
3096
+ float sum_x2 = 0;
3097
+ for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3098
+ float sigma2 = sum_x2/n_per_row;
2290
3099
 
2291
- #if QK_K == 256
2292
- for (int n = 0; n < QK_K; n += 128) {
2293
- for (int l = 0; l < 32; ++l) {
2294
- int is = l/16;
2295
- const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2296
- const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2297
- const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2298
- const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2299
- y[l + 0] = d * sc[is + 0] * q1;
2300
- y[l + 32] = d * sc[is + 2] * q2;
2301
- y[l + 64] = d * sc[is + 4] * q3;
2302
- y[l + 96] = d * sc[is + 6] * q4;
2303
- }
2304
- y += 128;
2305
- ql += 64;
2306
- qh += 32;
2307
- sc += 8;
3100
+ const int nb = n_per_row/QK4_1;
3101
+ for (int ib = 0; ib < nb; ++ib) {
3102
+ const float * xb = x + QK4_1 * ib;
3103
+ const float * qw = quant_weights + QK4_1 * ib;
3104
+ for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
3105
+ float min;
3106
+ float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
3107
+ y[ib].d = GGML_FP32_TO_FP16(d);
3108
+ y[ib].m = GGML_FP32_TO_FP16(-min);
3109
+ for (int j = 0; j < 16; ++j) {
3110
+ y[ib].qs[j] = L[j] | (L[j+16] << 4);
2308
3111
  }
2309
- #else
2310
- for (int l = 0; l < 16; ++l) {
2311
- const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2312
- const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2313
- const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2314
- const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2315
- y[l+ 0] = d * sc[0] * q1;
2316
- y[l+16] = d * sc[1] * q2;
2317
- y[l+32] = d * sc[2] * q3;
2318
- y[l+48] = d * sc[3] * q4;
3112
+ }
3113
+ }
3114
+
3115
+ size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3116
+ if (!quant_weights) {
3117
+ return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
3118
+ }
3119
+ int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3120
+ char * qrow = (char *)dst;
3121
+ for (int row = 0; row < nrow; ++row) {
3122
+ quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
3123
+ src += n_per_row;
3124
+ qrow += row_size;
3125
+ }
3126
+ return nrow * row_size;
3127
+ }
3128
+
3129
+ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
3130
+ static_assert(QK5_0 == 32, "QK5_0 must be 32");
3131
+
3132
+ if (!quant_weights) {
3133
+ quantize_row_q5_0_reference(x, y, n_per_row);
3134
+ return;
3135
+ }
3136
+
3137
+ float weight[QK5_0];
3138
+ int8_t L[QK5_0];
3139
+
3140
+ float sum_x2 = 0;
3141
+ for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3142
+ float sigma2 = sum_x2/n_per_row;
3143
+
3144
+ const int nb = n_per_row/QK5_0;
3145
+ for (int ib = 0; ib < nb; ++ib) {
3146
+ const float * xb = x + QK5_0 * ib;
3147
+ const float * qw = quant_weights + QK5_0 * ib;
3148
+ for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
3149
+ float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
3150
+ y[ib].d = GGML_FP32_TO_FP16(d);
3151
+
3152
+ uint32_t qh = 0;
3153
+
3154
+ for (int j = 0; j < 16; ++j) {
3155
+ const uint8_t xi0 = L[j];
3156
+ const uint8_t xi1 = L[j+16];
3157
+ y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
3158
+
3159
+ // get the 5-th bit and store it in qh at the right position
3160
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
3161
+ qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
2319
3162
  }
2320
- y += 64;
2321
- #endif
2322
3163
 
3164
+ memcpy(&y[ib].qh, &qh, sizeof(qh));
2323
3165
  }
2324
3166
  }
2325
3167
 
2326
- void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
2327
- assert(k % QK_K == 0);
2328
- block_q6_K * restrict y = vy;
2329
- quantize_row_q6_K_reference(x, y, k);
3168
+ size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3169
+ if (!quant_weights) {
3170
+ return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
3171
+ }
3172
+ int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3173
+ char * qrow = (char *)dst;
3174
+ for (int row = 0; row < nrow; ++row) {
3175
+ quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
3176
+ src += n_per_row;
3177
+ qrow += row_size;
3178
+ }
3179
+ return nrow * row_size;
2330
3180
  }
2331
3181
 
2332
- size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
2333
- assert(k % QK_K == 0);
2334
- (void)hist; // TODO: collect histograms
3182
+ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
3183
+ static_assert(QK5_1 == 32, "QK5_1 must be 32");
2335
3184
 
2336
- for (int j = 0; j < n; j += k) {
2337
- block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
2338
- quantize_row_q6_K_reference(src + j, y, k);
3185
+ if (!quant_weights) {
3186
+ quantize_row_q5_1_reference(x, y, n_per_row);
3187
+ return;
2339
3188
  }
2340
- return (n/QK_K*sizeof(block_q6_K));
3189
+
3190
+ float weight[QK5_1];
3191
+ uint8_t L[QK5_1], Laux[QK5_1];
3192
+
3193
+ float sum_x2 = 0;
3194
+ for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3195
+ float sigma2 = sum_x2/n_per_row;
3196
+
3197
+ const int nb = n_per_row/QK5_1;
3198
+ for (int ib = 0; ib < nb; ++ib) {
3199
+ const float * xb = x + QK5_1 * ib;
3200
+ const float * qw = quant_weights + QK5_1 * ib;
3201
+ for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
3202
+ float min;
3203
+ float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
3204
+ y[ib].d = GGML_FP32_TO_FP16(d);
3205
+ y[ib].m = GGML_FP32_TO_FP16(-min);
3206
+
3207
+ uint32_t qh = 0;
3208
+ for (int j = 0; j < 16; ++j) {
3209
+ const uint8_t xi0 = L[j];
3210
+ const uint8_t xi1 = L[j+16];
3211
+ y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
3212
+ // get the 5-th bit and store it in qh at the right position
3213
+ qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
3214
+ qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
3215
+ }
3216
+ memcpy(&y[ib].qh, &qh, sizeof(qh));
3217
+ }
3218
+ }
3219
+
3220
+ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3221
+ if (!quant_weights) {
3222
+ return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
3223
+ }
3224
+ int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3225
+ char * qrow = (char *)dst;
3226
+ for (int row = 0; row < nrow; ++row) {
3227
+ quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
3228
+ src += n_per_row;
3229
+ qrow += row_size;
3230
+ }
3231
+ return nrow * row_size;
2341
3232
  }
2342
3233
 
2343
3234
  // ====================== "True" 2-bit (de)-quantization
@@ -2553,14 +3444,6 @@ static const uint8_t ksigns_iq2xs[128] = {
2553
3444
 
2554
3445
  static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2555
3446
 
2556
- void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
2557
- (void)x;
2558
- (void)y;
2559
- (void)k;
2560
- assert(k % QK_K == 0);
2561
- //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2562
- }
2563
-
2564
3447
  void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
2565
3448
  assert(k % QK_K == 0);
2566
3449
  const int nb = k / QK_K;
@@ -2587,33 +3470,8 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
2587
3470
  }
2588
3471
  }
2589
3472
 
2590
- void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
2591
- assert(k % QK_K == 0);
2592
- block_iq2_xxs * restrict y = vy;
2593
- quantize_row_iq2_xxs_reference(x, y, k);
2594
- }
2595
-
2596
- size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
2597
- assert(k % QK_K == 0);
2598
- (void)hist; // TODO: collect histograms
2599
-
2600
- for (int j = 0; j < n; j += k) {
2601
- block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
2602
- quantize_row_iq2_xxs_reference(src + j, y, k);
2603
- }
2604
- return (n/QK_K*sizeof(block_iq2_xxs));
2605
- }
2606
-
2607
3473
  // ====================== 2.3125 bpw (de)-quantization
2608
3474
 
2609
- void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
2610
- (void)x;
2611
- (void)y;
2612
- (void)k;
2613
- assert(k % QK_K == 0);
2614
- //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2615
- }
2616
-
2617
3475
  void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
2618
3476
  assert(k % QK_K == 0);
2619
3477
  const int nb = k / QK_K;
@@ -2639,23 +3497,6 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
2639
3497
  }
2640
3498
  }
2641
3499
 
2642
- void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
2643
- assert(k % QK_K == 0);
2644
- block_iq2_xs * restrict y = vy;
2645
- quantize_row_iq2_xs_reference(x, y, k);
2646
- }
2647
-
2648
- size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
2649
- assert(k % QK_K == 0);
2650
- (void)hist; // TODO: collect histograms
2651
-
2652
- for (int j = 0; j < n; j += k) {
2653
- block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
2654
- quantize_row_iq2_xs_reference(src + j, y, k);
2655
- }
2656
- return (n/QK_K*sizeof(block_iq2_xs));
2657
- }
2658
-
2659
3500
  //===================================== Q8_K ==============================================
2660
3501
 
2661
3502
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -7554,9 +8395,9 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
7554
8395
 
7555
8396
  const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
7556
8397
 
7557
- int8x16x4_t q2u;
7558
- int8x16x4_t q2s;
7559
- int8x16x4_t q8b;
8398
+ ggml_int8x16x4_t q2u;
8399
+ ggml_int8x16x4_t q2s;
8400
+ ggml_int8x16x4_t q8b;
7560
8401
 
7561
8402
  int32x4x4_t scales32;
7562
8403
 
@@ -7578,7 +8419,7 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
7578
8419
  scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
7579
8420
  int32x4_t sumi = vdupq_n_s32(0);
7580
8421
  for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
7581
- q8b = vld1q_s8_x4(q8); q8 += 64;
8422
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
7582
8423
  q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
7583
8424
  q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
7584
8425
  q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
@@ -7699,3 +8540,666 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
7699
8540
  *s = 0.125f * sumf;
7700
8541
  #endif
7701
8542
  }
8543
+
8544
+ // ================================ IQ2 quantization =============================================
8545
+
8546
// Heap-allocated lookup tables for one IQ2 grid size.
typedef struct {
    uint64_t * grid;        // grid points, one uint64_t each
    int      * map;         // packed code -> grid index or neighbour-list reference
    uint16_t * neighbours;  // neighbour lists
} iq2_entry_t;

// One slot per supported grid size; every pointer starts out NULL.
static iq2_entry_t iq2_data[2] = {
    { .grid = NULL, .map = NULL, .neighbours = NULL },
    { .grid = NULL, .map = NULL, .neighbours = NULL },
};
8556
+
8557
// Slot in iq2_data for a grid size: 0 for 256, 1 for 512. Any other size
// trips the assertion.
static inline int iq2_data_index(int grid_size) {
    GGML_ASSERT(grid_size == 256 || grid_size == 512);
    if (grid_size == 256) {
        return 0;
    }
    return 1;
}
8561
+
8562
// qsort comparator for pairs of ints: order by the first element, then by
// the second. Returns -1, 0 or 1.
static int iq2_compare_func(const void * left, const void * right) {
    const int * a = (const int *)left;
    const int * b = (const int *)right;
    for (int i = 0; i < 2; ++i) {
        if (a[i] < b[i]) {
            return -1;
        }
        if (a[i] > b[i]) {
            return 1;
        }
    }
    return 0;
}
8567
+
8568
+ static void q2xs_init_impl(int grid_size) {
8569
+ const int gindex = iq2_data_index(grid_size);
8570
+ if (iq2_data[gindex].grid) {
8571
+ return;
8572
+ }
8573
+ static const uint16_t kgrid_256[256] = {
8574
+ 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
8575
+ 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
8576
+ 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
8577
+ 1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
8578
+ 2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
8579
+ 4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
8580
+ 5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
8581
+ 8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
8582
+ 10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
8583
+ 16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
8584
+ 17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
8585
+ 20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
8586
+ 22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
8587
+ 25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
8588
+ 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
8589
+ 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
8590
+ };
8591
+ static const uint16_t kgrid_512[512] = {
8592
+ 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
8593
+ 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
8594
+ 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
8595
+ 352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
8596
+ 640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
8597
+ 1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
8598
+ 1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
8599
+ 2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
8600
+ 2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
8601
+ 4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
8602
+ 4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
8603
+ 5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
8604
+ 5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
8605
+ 8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
8606
+ 8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
8607
+ 10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
8608
+ 16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
8609
+ 16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
8610
+ 16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
8611
+ 17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
8612
+ 18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
8613
+ 20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
8614
+ 21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
8615
+ 22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
8616
+ 24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
8617
+ 32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
8618
+ 33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
8619
+ 33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
8620
+ 35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
8621
+ 37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
8622
+ 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
8623
+ 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
8624
+ };
8625
+ const int kmap_size = 43692;
8626
+ const int nwant = 2;
8627
+ const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
8628
+ uint64_t * kgrid_q2xs;
8629
+ int * kmap_q2xs;
8630
+ uint16_t * kneighbors_q2xs;
8631
+
8632
+ printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
8633
+ uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
8634
+ for (int k = 0; k < grid_size; ++k) {
8635
+ int8_t * pos = (int8_t *)(the_grid + k);
8636
+ for (int i = 0; i < 8; ++i) {
8637
+ int l = (kgrid[k] >> 2*i) & 0x3;
8638
+ pos[i] = 2*l + 1;
8639
+ }
8640
+ }
8641
+ kgrid_q2xs = the_grid;
8642
+ iq2_data[gindex].grid = the_grid;
8643
+ kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
8644
+ iq2_data[gindex].map = kmap_q2xs;
8645
+ for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
8646
+ uint64_t aux64;
8647
+ uint8_t * aux8 = (uint8_t *)&aux64;
8648
+ for (int i = 0; i < grid_size; ++i) {
8649
+ aux64 = kgrid_q2xs[i];
8650
+ uint16_t index = 0;
8651
+ for (int k=0; k<8; ++k) {
8652
+ uint16_t q = (aux8[k] - 1)/2;
8653
+ index |= (q << 2*k);
8654
+ }
8655
+ kmap_q2xs[index] = i;
8656
+ }
8657
+ int8_t pos[8];
8658
+ int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
8659
+ int num_neighbors = 0, num_not_in_map = 0;
8660
+ for (int i = 0; i < kmap_size; ++i) {
8661
+ if (kmap_q2xs[i] >= 0) continue;
8662
+ ++num_not_in_map;
8663
+ for (int k = 0; k < 8; ++k) {
8664
+ int l = (i >> 2*k) & 0x3;
8665
+ pos[k] = 2*l + 1;
8666
+ }
8667
+ for (int j = 0; j < grid_size; ++j) {
8668
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
8669
+ int d2 = 0;
8670
+ for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
8671
+ dist2[2*j+0] = d2;
8672
+ dist2[2*j+1] = j;
8673
+ }
8674
+ qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
8675
+ int n = 0; int d2 = dist2[0];
8676
+ int nhave = 1;
8677
+ for (int j = 0; j < grid_size; ++j) {
8678
+ if (dist2[2*j] > d2) {
8679
+ if (nhave == nwant) break;
8680
+ d2 = dist2[2*j];
8681
+ ++nhave;
8682
+ }
8683
+ ++n;
8684
+ }
8685
+ num_neighbors += n;
8686
+ }
8687
+ printf("%s: %d neighbours in total\n", __func__, num_neighbors);
8688
+ kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
8689
+ iq2_data[gindex].neighbours = kneighbors_q2xs;
8690
+ int counter = 0;
8691
+ for (int i = 0; i < kmap_size; ++i) {
8692
+ if (kmap_q2xs[i] >= 0) continue;
8693
+ for (int k = 0; k < 8; ++k) {
8694
+ int l = (i >> 2*k) & 0x3;
8695
+ pos[k] = 2*l + 1;
8696
+ }
8697
+ for (int j = 0; j < grid_size; ++j) {
8698
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
8699
+ int d2 = 0;
8700
+ for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
8701
+ dist2[2*j+0] = d2;
8702
+ dist2[2*j+1] = j;
8703
+ }
8704
+ qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
8705
+ kmap_q2xs[i] = -(counter + 1);
8706
+ int d2 = dist2[0];
8707
+ uint16_t * start = &kneighbors_q2xs[counter++];
8708
+ int n = 0, nhave = 1;
8709
+ for (int j = 0; j < grid_size; ++j) {
8710
+ if (dist2[2*j] > d2) {
8711
+ if (nhave == nwant) break;
8712
+ d2 = dist2[2*j];
8713
+ ++nhave;
8714
+ }
8715
+ kneighbors_q2xs[counter++] = dist2[2*j+1];
8716
+ ++n;
8717
+ }
8718
+ *start = n;
8719
+ }
8720
+ free(dist2);
8721
+ }
8722
+
8723
+ void ggml_init_iq2_quantization(enum ggml_type type) {
8724
+ if (type == GGML_TYPE_IQ2_XXS) {
8725
+ q2xs_init_impl(256);
8726
+ }
8727
+ else if (type == GGML_TYPE_IQ2_XS) {
8728
+ q2xs_init_impl(512);
8729
+ }
8730
+ else {
8731
+ fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
8732
+ }
8733
+ }
8734
+
8735
+ static void q2xs_deinit_impl(int grid_size) {
8736
+ GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
8737
+ const int gindex = iq2_data_index(grid_size);
8738
+ if (iq2_data[gindex].grid) {
8739
+ free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
8740
+ free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
8741
+ free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
8742
+ }
8743
+ }
8744
+
8745
+ void ggml_deinit_iq2_quantization(enum ggml_type type) {
8746
+ if (type == GGML_TYPE_IQ2_XXS) {
8747
+ q2xs_deinit_impl(256);
8748
+ }
8749
+ else if (type == GGML_TYPE_IQ2_XS) {
8750
+ q2xs_deinit_impl(512);
8751
+ }
8752
+ else {
8753
+ fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
8754
+ }
8755
+ }
8756
+
8757
// Among the candidate grid points listed in neighbours (a count followed by
// that many grid indices), pick the one minimising
//     sum_i weight[i] * (scale*g[i] - xval[i])^2
// over its eight coordinates g. Writes (g[i] - 1)/2 for the winner into L and
// returns the winner's grid index.
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);

    const uint16_t * candidates = neighbours + 1;
    float best_d2 = FLT_MAX;
    int grid_index = -1;
    for (int j = 0; j < num_neighbors; ++j) {
        const int8_t * pg = (const int8_t *)(grid + candidates[j]);
        float d2 = 0;
        for (int i = 0; i < 8; ++i) {
            const float q = pg[i];
            const float diff = scale*q - xval[i];
            d2 += weight[i]*diff*diff;
        }
        // Strict comparison: the earliest candidate wins ties.
        if (d2 < best_d2) {
            best_d2 = d2;
            grid_index = candidates[j];
        }
    }
    GGML_ASSERT(grid_index >= 0);

    const int8_t * best = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) {
        L[i] = (best[i] - 1)/2;
    }
    return grid_index;
}
8780
+
8781
+ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
8782
+
8783
+ const int gindex = iq2_data_index(256);
8784
+
8785
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
8786
+ const int * kmap_q2xs = iq2_data[gindex].map;
8787
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
8788
+
8789
+ GGML_ASSERT(quant_weights);
8790
+ GGML_ASSERT(kgrid_q2xs);
8791
+ GGML_ASSERT(kmap_q2xs);
8792
+ GGML_ASSERT(kneighbors_q2xs);
8793
+ GGML_ASSERT(n%QK_K == 0);
8794
+
8795
+ const int kMaxQ = 3;
8796
+
8797
+ const int nbl = n/256;
8798
+
8799
+ block_iq2_xxs * y = vy;
8800
+
8801
+ float scales[QK_K/32];
8802
+ float weight[32];
8803
+ float xval[32];
8804
+ int8_t L[32];
8805
+ int8_t Laux[32];
8806
+ float waux[32];
8807
+ bool is_on_grid[4];
8808
+ bool is_on_grid_aux[4];
8809
+ uint8_t block_signs[4];
8810
+ uint32_t q2[2*(QK_K/32)];
8811
+
8812
+ for (int ibl = 0; ibl < nbl; ++ibl) {
8813
+
8814
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
8815
+ memset(q2, 0, QK_K/4);
8816
+
8817
+ float max_scale = 0;
8818
+
8819
+ const float * xbl = x + QK_K*ibl;
8820
+ float sumx2 = 0;
8821
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
8822
+ float sigma2 = sumx2/QK_K;
8823
+
8824
+ for (int ib = 0; ib < QK_K/32; ++ib) {
8825
+ const float * xb = xbl + 32*ib;
8826
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
8827
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
8828
+ for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
8829
+ for (int k = 0; k < 4; ++k) {
8830
+ int nflip = 0;
8831
+ uint8_t s = 0;
8832
+ for (int i = 0; i < 8; ++i) {
8833
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
8834
+ else {
8835
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
8836
+ }
8837
+ }
8838
+ if (nflip%2) {
8839
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
8840
+ for (int i = 1; i < 8; ++i) {
8841
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
8842
+ if (ax < min) {
8843
+ min = ax; imin = i;
8844
+ }
8845
+ }
8846
+ xval[8*k+imin] = -xval[8*k+imin];
8847
+ s ^= (1 << imin);
8848
+ }
8849
+ block_signs[k] = s & 127;
8850
+ }
8851
+ float max = xval[0];
8852
+ for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
8853
+ if (!max) {
8854
+ scales[ib] = 0;
8855
+ memset(L, 0, 32);
8856
+ continue;
8857
+ }
8858
+ float best = 0;
8859
+ float scale = max/(2*kMaxQ-1);
8860
+ for (int is = -9; is <= 9; ++is) {
8861
+ float id = (2*kMaxQ-1+is*0.1f)/max;
8862
+ float this_scale = 1/id;
8863
+ for (int k = 0; k < 4; ++k) {
8864
+ for (int i = 0; i < 8; ++i) {
8865
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
8866
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
8867
+ }
8868
+ uint16_t u = 0;
8869
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
8870
+ int grid_index = kmap_q2xs[u];
8871
+ is_on_grid_aux[k] = true;
8872
+ if (grid_index < 0) {
8873
+ is_on_grid_aux[k] = false;
8874
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
8875
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
8876
+ }
8877
+ }
8878
+ float sumqx = 0, sumq2 = 0;
8879
+ for (int i = 0; i < 32; ++i) {
8880
+ float w = weight[i];
8881
+ float q = 2*Laux[i] + 1;
8882
+ sumqx += w*xval[i]*q;
8883
+ sumq2 += w*q*q;
8884
+ }
8885
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
8886
+ scale = sumqx/sumq2; best = scale*sumqx;
8887
+ for (int i = 0; i < 32; ++i) L[i] = Laux[i];
8888
+ for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
8889
+ }
8890
+ }
8891
+ int n_not_ongrid = 0;
8892
+ for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
8893
+ if (n_not_ongrid > 0 && scale > 0) {
8894
+ float id = 1/scale;
8895
+ for (int k = 0; k < 4; ++k) {
8896
+ if (is_on_grid[k]) continue;
8897
+ uint16_t u = 0;
8898
+ for (int i = 0; i < 8; ++i) {
8899
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
8900
+ l = MAX(0, MIN(kMaxQ-1, l));
8901
+ u |= (l << 2*i);
8902
+ }
8903
+ int grid_index = kmap_q2xs[u];
8904
+ if (grid_index < 0) {
8905
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
8906
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
8907
+ }
8908
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
8909
+ for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
8910
+ }
8911
+ float sumqx = 0, sumq2 = 0;
8912
+ for (int i = 0; i < 32; ++i) {
8913
+ float w = weight[i];
8914
+ float q = 2*L[i] + 1;
8915
+ sumqx += w*xval[i]*q;
8916
+ sumq2 += w*q*q;
8917
+ }
8918
+ if (sumq2 > 0) scale = sumqx/sumq2;
8919
+ }
8920
+ if (scale < 0) {
8921
+ // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
8922
+ // and correspondingly flip quant signs.
8923
+ scale = -scale;
8924
+ for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
8925
+ }
8926
+ for (int k = 0; k < 4; ++k) {
8927
+ uint16_t u = 0;
8928
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
8929
+ int grid_index = kmap_q2xs[u];
8930
+ if (grid_index < 0) {
8931
+ printf("Oops: found point %u not on grid:", u);
8932
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
8933
+ printf("\n");
8934
+ GGML_ASSERT(false);
8935
+ }
8936
+ q2[2*ib+0] |= (grid_index << 8*k);
8937
+ q2[2*ib+1] |= (block_signs[k] << 7*k);
8938
+ }
8939
+ GGML_ASSERT(scale >= 0);
8940
+ scales[ib] = scale;
8941
+ max_scale = MAX(max_scale, scale);
8942
+ }
8943
+
8944
+ if (!max_scale) {
8945
+ memset(y[ibl].qs, 0, QK_K/4);
8946
+ continue;
8947
+ }
8948
+
8949
+ float d = max_scale/31;
8950
+ y[ibl].d = GGML_FP32_TO_FP16(d);
8951
+ float id = 1/d;
8952
+ float sumqx = 0, sumq2 = 0;
8953
+ for (int ib = 0; ib < QK_K/32; ++ib) {
8954
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
8955
+ l = MAX(0, MIN(15, l));
8956
+ q2[2*ib+1] |= ((uint32_t)l << 28);
8957
+ const float * xb = xbl + 32*ib;
8958
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
8959
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
8960
+ const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
8961
+ const float db = d * (1 + 2*l);
8962
+ uint32_t u = 0;
8963
+ for (int k = 0; k < 4; ++k) {
8964
+ const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
8965
+ const float * xk = xb + 8*k;
8966
+ const float * wk = weight + 8*k;
8967
+ const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
8968
+ float best_mse = 0; int best_index = aux8[k];
8969
+ for (int j = 0; j < 8; ++j) {
8970
+ float diff = db * grid[j] * signs[j] - xk[j];
8971
+ best_mse += wk[j] * diff * diff;
8972
+ }
8973
+ for (int idx = 0; idx < 256; ++idx) {
8974
+ grid = (const uint8_t *)(kgrid_q2xs + idx);
8975
+ float mse = 0;
8976
+ for (int j = 0; j < 8; ++j) {
8977
+ float diff = db * grid[j] * signs[j] - xk[j];
8978
+ mse += wk[j] * diff * diff;
8979
+ }
8980
+ if (mse < best_mse) {
8981
+ best_mse = mse; best_index = idx;
8982
+ }
8983
+ }
8984
+ u |= (best_index << 8*k);
8985
+ grid = (const uint8_t *)(kgrid_q2xs + best_index);
8986
+ //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
8987
+ for (int j = 0; j < 8; ++j) {
8988
+ float q = db * grid[j] * signs[j];
8989
+ sumqx += wk[j] * q * xk[j];
8990
+ sumq2 += wk[j] * q * q;
8991
+ }
8992
+ }
8993
+ q2[2*ib] = u;
8994
+ if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
8995
+ }
8996
+ memcpy(y[ibl].qs, q2, QK_K/4);
8997
+ }
8998
+ }
8999
+
9000
+ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
9001
+
9002
+ const int gindex = iq2_data_index(512);
9003
+
9004
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
9005
+ const int * kmap_q2xs = iq2_data[gindex].map;
9006
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
9007
+
9008
+ GGML_ASSERT(quant_weights);
9009
+ GGML_ASSERT(kmap_q2xs);
9010
+ GGML_ASSERT(kgrid_q2xs);
9011
+ GGML_ASSERT(kneighbors_q2xs);
9012
+ GGML_ASSERT(n%QK_K == 0);
9013
+
9014
+ const int kMaxQ = 3;
9015
+
9016
+ const int nbl = n/256;
9017
+
9018
+ block_iq2_xs * y = vy;
9019
+
9020
+ float scales[QK_K/16];
9021
+ float weight[16];
9022
+ float xval[16];
9023
+ int8_t L[16];
9024
+ int8_t Laux[16];
9025
+ float waux[16];
9026
+ bool is_on_grid[2];
9027
+ bool is_on_grid_aux[2];
9028
+ uint8_t block_signs[2];
9029
+ uint16_t q2[2*(QK_K/16)];
9030
+
9031
+ for (int ibl = 0; ibl < nbl; ++ibl) {
9032
+
9033
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
9034
+ memset(q2, 0, QK_K/4);
9035
+ memset(y[ibl].scales, 0, QK_K/32);
9036
+
9037
+ float max_scale = 0;
9038
+
9039
+ const float * xbl = x + QK_K*ibl;
9040
+ float sumx2 = 0;
9041
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
9042
+ float sigma2 = sumx2/QK_K;
9043
+
9044
+ for (int ib = 0; ib < QK_K/16; ++ib) {
9045
+ const float * xb = xbl + 16*ib;
9046
+ const float * qw = quant_weights + QK_K*ibl + 16*ib;
9047
+ for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
9048
+ for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
9049
+ for (int k = 0; k < 2; ++k) {
9050
+ int nflip = 0;
9051
+ uint8_t s = 0;
9052
+ for (int i = 0; i < 8; ++i) {
9053
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
9054
+ else {
9055
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
9056
+ }
9057
+ }
9058
+ if (nflip%2) {
9059
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
9060
+ for (int i = 1; i < 8; ++i) {
9061
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
9062
+ if (ax < min) {
9063
+ min = ax; imin = i;
9064
+ }
9065
+ }
9066
+ xval[8*k+imin] = -xval[8*k+imin];
9067
+ s ^= (1 << imin);
9068
+ }
9069
+ block_signs[k] = s & 127;
9070
+ }
9071
+ float max = xval[0];
9072
+ for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
9073
+ if (!max) {
9074
+ scales[ib] = 0;
9075
+ memset(L, 0, 16);
9076
+ continue;
9077
+ }
9078
+ float best = 0;
9079
+ float scale = max/(2*kMaxQ-1);
9080
+ is_on_grid[0] = is_on_grid[1] = true;
9081
+ for (int is = -9; is <= 9; ++is) {
9082
+ float id = (2*kMaxQ-1+is*0.1f)/max;
9083
+ float this_scale = 1/id;
9084
+ for (int k = 0; k < 2; ++k) {
9085
+ for (int i = 0; i < 8; ++i) {
9086
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
9087
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
9088
+ }
9089
+ uint16_t u = 0;
9090
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
9091
+ int grid_index = kmap_q2xs[u];
9092
+ is_on_grid_aux[k] = true;
9093
+ if (grid_index < 0) {
9094
+ is_on_grid_aux[k] = false;
9095
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
9096
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
9097
+ }
9098
+ }
9099
+ float sumqx = 0, sumq2 = 0;
9100
+ for (int i = 0; i < 16; ++i) {
9101
+ float w = weight[i];
9102
+ float q = 2*Laux[i] + 1;
9103
+ sumqx += w*xval[i]*q;
9104
+ sumq2 += w*q*q;
9105
+ }
9106
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
9107
+ scale = sumqx/sumq2; best = scale*sumqx;
9108
+ for (int i = 0; i < 16; ++i) L[i] = Laux[i];
9109
+ for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
9110
+ }
9111
+ }
9112
+ int n_not_ongrid = 0;
9113
+ for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
9114
+ if (n_not_ongrid > 0 && scale > 0) {
9115
+ float id = 1/scale;
9116
+ for (int k = 0; k < 2; ++k) {
9117
+ if (is_on_grid[k]) continue;
9118
+ uint16_t u = 0;
9119
+ for (int i = 0; i < 8; ++i) {
9120
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
9121
+ l = MAX(0, MIN(kMaxQ-1, l));
9122
+ u |= (l << 2*i);
9123
+ L[8*k + i] = l;
9124
+ }
9125
+ int grid_index = kmap_q2xs[u];
9126
+ if (grid_index < 0) {
9127
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
9128
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
9129
+ }
9130
+ }
9131
+ float sumqx = 0, sumq2 = 0;
9132
+ for (int i = 0; i < 16; ++i) {
9133
+ float w = weight[i];
9134
+ float q = 2*L[i] + 1;
9135
+ sumqx += w*xval[i]*q;
9136
+ sumq2 += w*q*q;
9137
+ }
9138
+ if (sumq2 > 0) scale = sumqx/sumq2;
9139
+ }
9140
+ if (scale < 0) {
9141
+ scale = -scale;
9142
+ for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
9143
+ }
9144
+ for (int k = 0; k < 2; ++k) {
9145
+ uint16_t u = 0;
9146
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
9147
+ int grid_index = kmap_q2xs[u];
9148
+ if (grid_index < 0) {
9149
+ printf("Oops: found point %u not on grid:", u);
9150
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
9151
+ printf("\n");
9152
+ GGML_ASSERT(false);
9153
+ }
9154
+ q2[2*ib+k] = grid_index | (block_signs[k] << 9);
9155
+ }
9156
+ GGML_ASSERT(scale >= 0);
9157
+ scales[ib] = scale;
9158
+ max_scale = MAX(max_scale, scale);
9159
+ }
9160
+
9161
+ if (!max_scale) {
9162
+ memset(y[ibl].qs, 0, QK_K/4);
9163
+ continue;
9164
+ }
9165
+
9166
+ float d = max_scale/31;
9167
+ y[ibl].d = GGML_FP32_TO_FP16(d);
9168
+ float id = 1/d;
9169
+ for (int ib = 0; ib < QK_K/16; ++ib) {
9170
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
9171
+ l = MAX(0, MIN(15, l));
9172
+ if (ib%2 == 0) y[ibl].scales[ib/2] = l;
9173
+ else y[ibl].scales[ib/2] |= (l << 4);
9174
+ }
9175
+ memcpy(y[ibl].qs, q2, QK_K/4);
9176
+
9177
+ }
9178
+ }
9179
+
9180
+ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
9181
+ (void)hist;
9182
+ GGML_ASSERT(n_per_row%QK_K == 0);
9183
+ int nblock = n_per_row/QK_K;
9184
+ char * qrow = (char *)dst;
9185
+ for (int row = 0; row < nrow; ++row) {
9186
+ quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
9187
+ src += n_per_row;
9188
+ qrow += nblock*sizeof(block_iq2_xxs);
9189
+ }
9190
+ return nrow * nblock * sizeof(block_iq2_xxs);
9191
+ }
9192
+
9193
+ size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
9194
+ (void)hist;
9195
+ GGML_ASSERT(n_per_row%QK_K == 0);
9196
+ int nblock = n_per_row/QK_K;
9197
+ char * qrow = (char *)dst;
9198
+ for (int row = 0; row < nrow; ++row) {
9199
+ quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
9200
+ src += n_per_row;
9201
+ qrow += nblock*sizeof(block_iq2_xs);
9202
+ }
9203
+ return nrow * nblock * sizeof(block_iq2_xs);
9204
+ }
9205
+