llama_cpp 0.12.1 → 0.12.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
@@ -5,6 +5,8 @@
|
|
5
5
|
#include <string.h>
|
6
6
|
#include <assert.h>
|
7
7
|
#include <float.h>
|
8
|
+
#include <stdlib.h> // for qsort
|
9
|
+
#include <stdio.h> // for GGML_ASSERT
|
8
10
|
|
9
11
|
#ifdef __ARM_NEON
|
10
12
|
|
@@ -272,10 +274,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
|
272
274
|
|
273
275
|
// vaddvq_s16
|
274
276
|
// vpaddq_s16
|
277
|
+
// vpaddq_s32
|
275
278
|
// vaddvq_s32
|
276
279
|
// vaddvq_f32
|
277
280
|
// vmaxvq_f32
|
278
281
|
// vcvtnq_s32_f32
|
282
|
+
// vzip1_u8
|
283
|
+
// vzip2_u8
|
279
284
|
|
280
285
|
inline static int32_t vaddvq_s16(int16x8_t v) {
|
281
286
|
return
|
@@ -291,6 +296,12 @@ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
|
291
296
|
return vcombine_s16(a0, b0);
|
292
297
|
}
|
293
298
|
|
299
|
+
// Pairwise add of adjacent 32-bit lanes, built from the 64-bit vpadd_s32.
// Result lanes: { a0+a1, a2+a3, b0+b1, b2+b3 }.
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
    const int32x2_t pairs_a = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
    const int32x2_t pairs_b = vpadd_s32(vget_low_s32(b), vget_high_s32(b));

    return vcombine_s32(pairs_a, pairs_b);
}
|
304
|
+
|
294
305
|
// Horizontal sum of the four 32-bit lanes of v.
inline static int32_t vaddvq_s32(int32x4_t v) {
    const int32_t lane0 = vgetq_lane_s32(v, 0);
    const int32_t lane1 = vgetq_lane_s32(v, 1);
    const int32_t lane2 = vgetq_lane_s32(v, 2);
    const int32_t lane3 = vgetq_lane_s32(v, 3);

    return lane0 + lane1 + lane2 + lane3;
}
|
@@ -316,6 +327,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
|
316
327
|
return res;
|
317
328
|
}
|
318
329
|
|
330
|
+
// Interleaves the low four bytes of a and b: { a0, b0, a1, b1, a2, b2, a3, b3 }.
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    for (int i = 0; i < 4; ++i) {
        res[2*i    ] = a[i];
        res[2*i + 1] = b[i];
    }

    return res;
}
|
340
|
+
|
341
|
+
// Interleaves the high four bytes of a and b: { a4, b4, a5, b5, a6, b6, a7, b7 }.
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    for (int i = 0; i < 4; ++i) {
        res[2*i    ] = a[4 + i];
        res[2*i + 1] = b[4 + i];
    }

    return res;
}
|
351
|
+
|
319
352
|
// vld1q_s16_x2
|
320
353
|
// vld1q_u8_x2
|
321
354
|
// vld1q_u8_x4
|
@@ -482,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|
482
515
|
quantize_row_q4_0_reference(x, y, k);
|
483
516
|
}
|
484
517
|
|
518
|
+
|
485
519
|
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
|
486
520
|
const int qk = QK4_1;
|
487
521
|
|
@@ -1211,7 +1245,8 @@ static inline int nearest_int(float fval) {
|
|
1211
1245
|
return (i & 0x007fffff) - 0x00400000;
|
1212
1246
|
}
|
1213
1247
|
|
1214
|
-
static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type
|
1248
|
+
static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
|
1249
|
+
const float * restrict qw) {
|
1215
1250
|
float max = 0;
|
1216
1251
|
float amax = 0;
|
1217
1252
|
for (int i = 0; i < n; ++i) {
|
@@ -1237,14 +1272,13 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
1237
1272
|
rmse_type = -rmse_type;
|
1238
1273
|
return_early = true;
|
1239
1274
|
}
|
1240
|
-
int weight_type = rmse_type%2;
|
1241
1275
|
float sumlx = 0;
|
1242
1276
|
float suml2 = 0;
|
1243
1277
|
for (int i = 0; i < n; ++i) {
|
1244
1278
|
int l = nearest_int(iscale * x[i]);
|
1245
1279
|
l = MAX(-nmax, MIN(nmax-1, l));
|
1246
1280
|
L[i] = l + nmax;
|
1247
|
-
float w =
|
1281
|
+
float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
|
1248
1282
|
sumlx += w*x[i]*l;
|
1249
1283
|
suml2 += w*l*l;
|
1250
1284
|
}
|
@@ -1260,7 +1294,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
1260
1294
|
for (int i = 0; i < n; ++i) {
|
1261
1295
|
int l = nearest_int(iscale * x[i]);
|
1262
1296
|
l = MAX(-nmax, MIN(nmax-1, l));
|
1263
|
-
float w =
|
1297
|
+
float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
|
1264
1298
|
sumlx += w*x[i]*l;
|
1265
1299
|
suml2 += w*l*l;
|
1266
1300
|
}
|
@@ -1608,6 +1642,241 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n
|
|
1608
1642
|
return (n/QK_K*sizeof(block_q2_K));
|
1609
1643
|
}
|
1610
1644
|
|
1645
|
+
// Quantizes n values to levels L[i] in [0, nmax] under the model x[i] ~ scale*L[i] + min, with min <= 0.
//
//   weights  per-value importance; when NULL, x[i]*x[i] is used instead
//   L        output levels (n entries)
//   the_min  receives -min
//   Laux     scratch buffer for candidate levels (n entries)
//   rmin, rdelta, nstep
//            candidate inverse scales are (rmin + rdelta*is + nmax)/(max - min) for is = 0..nstep;
//            when nstep < 1 the plain min/max quantization is returned without any search
//   use_mad  rank candidates by weighted absolute error instead of weighted squared error
//
// Returns the selected scale, or 0 when max <= min after min has been clamped to at most 0
// (all levels are then 0).
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
        float rmin, float rdelta, int nstep, bool use_mad) {
    // value range plus the weighted sums that stay fixed for every candidate
    float lo    = x[0];
    float hi    = x[0];
    float sum_w = weights ? weights[0] : x[0]*x[0];
    float sum_x = sum_w * x[0];
    for (int i = 1; i < n; ++i) {
        if (x[i] < lo) { lo = x[i]; }
        if (x[i] > hi) { hi = x[i]; }
        const float w = weights ? weights[i] : x[i]*x[i];
        sum_w += w;
        sum_x += w * x[i];
    }
    if (lo > 0) {
        lo = 0;
    }
    if (hi <= lo) {
        for (int i = 0; i < n; ++i) { L[i] = 0; }
        *the_min = -lo;
        return 0.f;
    }

    // baseline: straight min/max quantization and its weighted error
    float iscale     = nmax/(hi - lo);
    float best_scale = 1/iscale;
    float best_err   = 0;
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(iscale*(x[i] - lo));
        L[i] = MAX(0, MIN(nmax, l));
        float diff = best_scale * L[i] + lo - x[i];
        diff = use_mad ? fabsf(diff) : diff*diff;
        const float w = weights ? weights[i] : x[i]*x[i];
        best_err += w * diff;
    }
    if (nstep < 1) {
        *the_min = -lo;
        return best_scale;
    }

    for (int is = 0; is <= nstep; ++is) {
        // note: lo is the current best min, which an earlier accepted candidate may have replaced
        iscale = (rmin + rdelta*is + nmax)/(hi - lo);

        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(iscale*(x[i] - lo));
            l = MAX(0, MIN(nmax, l));
            Laux[i] = l;
            const float w = weights ? weights[i] : x[i]*x[i];
            sum_l  += w*l;
            sum_l2 += w*l*l;
            sum_xl += w*l*x[i];
        }

        // weighted least-squares fit of (scale, min) for the candidate levels
        const float D = sum_w * sum_l2 - sum_l * sum_l;
        if (!(D > 0)) {
            continue;
        }
        float cand_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
        float cand_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
        if (cand_min > 0) {
            // min may not be positive: pin it to 0 and refit the scale alone
            cand_min   = 0;
            cand_scale = sum_xl / sum_l2;
        }

        float err = 0;
        for (int i = 0; i < n; ++i) {
            float diff = cand_scale * Laux[i] + cand_min - x[i];
            diff = use_mad ? fabsf(diff) : diff*diff;
            const float w = weights ? weights[i] : x[i]*x[i];
            err += w * diff;
        }
        if (err < best_err) {
            for (int i = 0; i < n; ++i) {
                L[i] = Laux[i];
            }
            best_err   = err;
            best_scale = cand_scale;
            lo         = cand_min;
        }
    }

    *the_min = -lo;
    return best_scale;
}
|
1722
|
+
|
1723
|
+
// Quantizes n values to levels L[i] <= nmax with a single scale and no offset (x[i] ~ scale*L[i]),
// minimizing the squared error weighted by quant_weights.
//
// Steps: (1) start from iscale = nmax/max, (2) try eight nearby inverse scales and keep the best,
// (3) up to five passes that re-pick individual levels when doing so improves the weighted fit.
//
// Returns the weighted least-squares scale sumlx/suml2, or 0 when the largest value is 0
// (all levels are then 0).
static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
    float max = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    if (!max) { // all zero
        for (int i = 0; i < n; ++i) { L[i] = 0; }
        return 0.f;
    }

    // baseline quantization and its weighted squared error
    float iscale = nmax / max;
    const float scale0 = 1/iscale;
    float best_mse = 0;
    for (int i = 0; i < n; ++i) {
        L[i] = nearest_int(iscale * x[i]);
        const float diff = x[i] - scale0*L[i];
        const float w = quant_weights[i];
        best_mse += w*diff*diff;
    }

    // small search around the baseline inverse scale
    for (int is = -4; is <= 4; ++is) {
        if (is == 0) {
            continue;
        }
        const float iscale_is = (0.1f*is + nmax)/max;
        const float scale_is  = 1/iscale_is;
        float mse = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(iscale_is*x[i]);
            l = MIN(nmax, l);
            const float diff = x[i] - scale_is*l;
            const float w = quant_weights[i];
            mse += w*diff*diff;
        }
        if (mse < best_mse) {
            best_mse = mse;
            iscale   = iscale_is;
        }
    }

    // final levels for the chosen inverse scale, plus the sums that define the optimal scale
    float sumlx = 0;
    float suml2 = 0;
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(iscale * x[i]);
        l = MIN(nmax, l);
        L[i] = l;
        const float w = quant_weights[i];
        sumlx += w*x[i]*l;
        suml2 += w*l*l;
    }

    // per-level refinement: accept a change only when sumlx^2/suml2 grows
    for (int itry = 0; itry < 5; ++itry) {
        int n_changed = 0;
        for (int i = 0; i < n; ++i) {
            const float w = quant_weights[i];
            float slx = sumlx - w*x[i]*L[i];
            float sl2 = suml2 - w*L[i]*L[i];
            if (!(slx > 0 && sl2 > 0)) {
                continue;
            }
            int new_l = nearest_int(x[i] * sl2 / slx);
            new_l = MIN(nmax, new_l);
            if (new_l == L[i]) {
                continue;
            }
            slx += w*x[i]*new_l;
            sl2 += w*new_l*new_l;
            if (slx*slx*suml2 > sumlx*sumlx*sl2) {
                L[i]  = new_l;
                sumlx = slx;
                suml2 = sl2;
                ++n_changed;
            }
        }
        if (!n_changed) {
            break;
        }
    }

    return sumlx / suml2;
}
|
1795
|
+
|
1796
|
+
// Quantizes one row of k floats into k/QK_K block_q2_K blocks using importance weights.
//
//   x              input row (k values, k must be a multiple of QK_K)
//   y              output blocks
//   quant_weights  per-position importance weights for the row; must not be NULL
//
// Per block: every 16-value sub-block gets a 2-bit quantization (levels 0..3) with its own scale and min
// via make_qkx3_quants; the sub-block scales and mins are then quantized to 4 bits each with
// make_qp_quants, and the values are re-quantized against the stored (fp16-rounded) scale and min.
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
    GGML_ASSERT(quant_weights);
    assert(k % QK_K == 0);
    const int nb = k / QK_K;
    const bool requantize = true;

    uint8_t L[QK_K];
    uint8_t Laux[16];
    float mins[QK_K/16];
    float scales[QK_K/16];
    float sw[QK_K/16];
    // One weight per value of a 16-wide sub-block. Sizing this as QK_K/16 only happens to work for
    // QK_K == 256; any smaller QK_K would write past the end in the loops below.
    float weight[16];
    uint8_t Ls[QK_K/16], Lm[QK_K/16];

    for (int i = 0; i < nb; i++) {
        memset(sw, 0, QK_K/16*sizeof(float));
        float sumx2 = 0;
        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
        float sigma2 = sumx2/QK_K;
        for (int j = 0; j < QK_K/16; ++j) {
            const float * restrict qw = quant_weights + QK_K * i + 16*j;
            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
            for (int l = 0; l < 16; ++l) sw[j] += weight[l];
            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
        }

        // quantize the sub-block scales and mins to 4 bits each, weighted by the summed sub-block weights
        float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
        float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
        y[i].d = GGML_FP32_TO_FP16(dm);
        y[i].dmin = GGML_FP32_TO_FP16(mm);
        // continue with the values as they will be read back from fp16
        dm = GGML_FP16_TO_FP32(y[i].d);
        mm = GGML_FP16_TO_FP32(y[i].dmin);

        for (int j = 0; j < QK_K/16; ++j) {
            y[i].scales[j] = Ls[j] | (Lm[j] << 4);
        }

        if (requantize) {
            for (int j = 0; j < QK_K/16; ++j) {
                const float d = dm * (y[i].scales[j] & 0xF);
                if (!d) continue;
                const float m = mm * (y[i].scales[j] >> 4);
                for (int ii = 0; ii < 16; ++ii) {
                    int l = nearest_int((x[16*j + ii] + m)/d);
                    l = MAX(0, MIN(3, l));
                    L[16*j + ii] = l;
                }
            }
        }

        // pack four 2-bit levels per byte
#if QK_K == 256
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
            }
        }
#else
        for (int l = 0; l < 16; ++l) {
            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
        }
#endif

        x += QK_K;

    }
}
|
1862
|
+
|
1863
|
+
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
1864
|
+
(void)hist;
|
1865
|
+
int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
1866
|
+
if (!quant_weights) {
|
1867
|
+
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
1868
|
+
}
|
1869
|
+
else {
|
1870
|
+
char * qrow = (char *)dst;
|
1871
|
+
for (int row = 0; row < nrow; ++row) {
|
1872
|
+
quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
|
1873
|
+
src += n_per_row;
|
1874
|
+
qrow += row_size;
|
1875
|
+
}
|
1876
|
+
}
|
1877
|
+
return nrow * row_size;
|
1878
|
+
}
|
1879
|
+
|
1611
1880
|
//========================= 3-bit (de)-quantization
|
1612
1881
|
|
1613
1882
|
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
@@ -1821,6 +2090,112 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n
|
|
1821
2090
|
return (n/QK_K*sizeof(block_q3_K));
|
1822
2091
|
}
|
1823
2092
|
|
2093
|
+
// Quantizes one row of n_per_row floats into block_q3_K blocks, optionally using importance weights.
//
//   x              input row (n_per_row values, a multiple of QK_K)
//   y              output blocks
//   quant_weights  per-position importance weights for the row, or NULL to weight by x*x
//
// When QK_K != 256 this simply forwards to quantize_row_q3_K_reference and ignores the weights.
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
#if QK_K != 256
    (void)quant_weights;
    quantize_row_q3_K_reference(x, y, n_per_row);
#else
    assert(n_per_row % QK_K == 0);
    const int nb = n_per_row / QK_K;

    int8_t L[QK_K];
    float scales[QK_K / 16];
    float weight[16];
    float sw[QK_K / 16];
    int8_t Ls[QK_K / 16];

    for (int i = 0; i < nb; i++) {

        float sumx2 = 0;
        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
        float sigma2 = 2*sumx2/QK_K;

        for (int j = 0; j < QK_K/16; ++j) {
            if (quant_weights) {
                // already inside the non-NULL branch, so no second NULL check is needed
                const float * qw = quant_weights + QK_K * i + 16*j;
                for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
            } else {
                for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
            }
            float sumw = 0;
            for (int l = 0; l < 16; ++l) sumw += weight[l];
            sw[j] = sumw;

            scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);

        }

        memset(y[i].scales, 0, 12);

        // quantize the 16 sub-block scales to 6 bits: low 4 bits go to bytes 0..7, high 2 bits to bytes 8..11
        float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
        for (int j = 0; j < QK_K/16; ++j) {
            int l = Ls[j];
            if (j < 8) {
                y[i].scales[j] = l & 0xF;
            } else {
                y[i].scales[j-8] |= ((l & 0xF) << 4);
            }
            l >>= 4;
            y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
        }
        y[i].d = GGML_FP32_TO_FP16(d_block);

        // re-quantize the values against the scales as they will be decoded
        int8_t sc;
        for (int j = 0; j < QK_K/16; ++j) {
            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
            if (!d) {
                continue;
            }
            for (int ii = 0; ii < 16; ++ii) {
                int l = nearest_int(x[16*j + ii]/d);
                l = MAX(-4, MIN(3, l));
                L[16*j + ii] = l + 4;
            }
        }

        memset(y[i].hmask, 0, QK_K/8);
        // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
        int m = 0;
        uint8_t hm = 1;
        for (int j = 0; j < QK_K; ++j) {
            if (L[j] > 3) {
                y[i].hmask[m] |= hm;
                L[j] -= 4;
            }
            if (++m == QK_K/8) {
                m = 0; hm <<= 1;
            }
        }
        // pack the remaining low 2 bits, four levels per byte
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
            }
        }

        x += QK_K;
    }
#endif
}
|
2181
|
+
|
2182
|
+
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
2183
|
+
(void)hist;
|
2184
|
+
int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
2185
|
+
if (!quant_weights) {
|
2186
|
+
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
2187
|
+
}
|
2188
|
+
else {
|
2189
|
+
char * qrow = (char *)dst;
|
2190
|
+
for (int row = 0; row < nrow; ++row) {
|
2191
|
+
quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
|
2192
|
+
src += n_per_row;
|
2193
|
+
qrow += row_size;
|
2194
|
+
}
|
2195
|
+
}
|
2196
|
+
return nrow * row_size;
|
2197
|
+
}
|
2198
|
+
|
1824
2199
|
// ====================== 4-bit (de)-quantization
|
1825
2200
|
|
1826
2201
|
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
|
@@ -1986,36 +2361,38 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n
|
|
1986
2361
|
return (n/QK_K*sizeof(block_q4_K));
|
1987
2362
|
}
|
1988
2363
|
|
1989
|
-
|
1990
|
-
|
1991
|
-
void
|
1992
|
-
|
1993
|
-
|
2364
|
+
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
|
2365
|
+
#if QK_K != 256
|
2366
|
+
(void)quant_weights;
|
2367
|
+
quantize_row_q4_K_reference(x, y, n_per_row);
|
2368
|
+
#else
|
2369
|
+
assert(n_per_row % QK_K == 0);
|
2370
|
+
const int nb = n_per_row / QK_K;
|
1994
2371
|
|
1995
|
-
#if QK_K == 256
|
1996
2372
|
uint8_t L[QK_K];
|
2373
|
+
uint8_t Laux[32];
|
2374
|
+
float weights[32];
|
1997
2375
|
float mins[QK_K/32];
|
1998
2376
|
float scales[QK_K/32];
|
1999
|
-
float weights[32];
|
2000
|
-
uint8_t Laux[32];
|
2001
|
-
#else
|
2002
|
-
int8_t L[QK_K];
|
2003
|
-
float scales[QK_K/16];
|
2004
|
-
#endif
|
2005
2377
|
|
2006
2378
|
for (int i = 0; i < nb; i++) {
|
2007
2379
|
|
2008
|
-
|
2380
|
+
float sum_x2 = 0;
|
2381
|
+
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
|
2382
|
+
float sigma2 = sum_x2/QK_K;
|
2383
|
+
float av_x = sqrtf(sigma2);
|
2009
2384
|
|
2010
2385
|
float max_scale = 0; // as we are deducting the min, scales are always positive
|
2011
2386
|
float max_min = 0;
|
2012
2387
|
for (int j = 0; j < QK_K/32; ++j) {
|
2013
|
-
|
2014
|
-
|
2015
|
-
|
2016
|
-
|
2017
|
-
|
2018
|
-
|
2388
|
+
if (quant_weights) {
|
2389
|
+
const float * qw = quant_weights + QK_K*i + 32*j;
|
2390
|
+
for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
|
2391
|
+
} else {
|
2392
|
+
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
2393
|
+
}
|
2394
|
+
scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
2395
|
+
//scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
|
2019
2396
|
float scale = scales[j];
|
2020
2397
|
if (scale > max_scale) {
|
2021
2398
|
max_scale = scale;
|
@@ -2053,18 +2430,118 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
2053
2430
|
const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
|
2054
2431
|
for (int ii = 0; ii < 32; ++ii) {
|
2055
2432
|
int l = nearest_int((x[32*j + ii] + dm)/d);
|
2056
|
-
l = MAX(0, MIN(
|
2433
|
+
l = MAX(0, MIN(15, l));
|
2057
2434
|
L[32*j + ii] = l;
|
2058
2435
|
}
|
2059
2436
|
}
|
2437
|
+
uint8_t * q = y[i].qs;
|
2438
|
+
for (int j = 0; j < QK_K; j += 64) {
|
2439
|
+
for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
|
2440
|
+
q += 32;
|
2441
|
+
}
|
2060
2442
|
|
2061
|
-
|
2062
|
-
uint8_t * restrict ql = y[i].qs;
|
2063
|
-
memset(qh, 0, QK_K/8);
|
2443
|
+
x += QK_K;
|
2064
2444
|
|
2065
|
-
|
2066
|
-
|
2067
|
-
|
2445
|
+
}
|
2446
|
+
#endif
|
2447
|
+
}
|
2448
|
+
|
2449
|
+
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
2450
|
+
(void)hist;
|
2451
|
+
int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
2452
|
+
if (!quant_weights) {
|
2453
|
+
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
2454
|
+
}
|
2455
|
+
else {
|
2456
|
+
char * qrow = (char *)dst;
|
2457
|
+
for (int row = 0; row < nrow; ++row) {
|
2458
|
+
quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
|
2459
|
+
src += n_per_row;
|
2460
|
+
qrow += row_size;
|
2461
|
+
}
|
2462
|
+
}
|
2463
|
+
return nrow * row_size;
|
2464
|
+
}
|
2465
|
+
|
2466
|
+
// ====================== 5-bit (de)-quantization
|
2467
|
+
|
2468
|
+
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
|
2469
|
+
assert(k % QK_K == 0);
|
2470
|
+
const int nb = k / QK_K;
|
2471
|
+
|
2472
|
+
#if QK_K == 256
|
2473
|
+
uint8_t L[QK_K];
|
2474
|
+
float mins[QK_K/32];
|
2475
|
+
float scales[QK_K/32];
|
2476
|
+
float weights[32];
|
2477
|
+
uint8_t Laux[32];
|
2478
|
+
#else
|
2479
|
+
int8_t L[QK_K];
|
2480
|
+
float scales[QK_K/16];
|
2481
|
+
#endif
|
2482
|
+
|
2483
|
+
for (int i = 0; i < nb; i++) {
|
2484
|
+
|
2485
|
+
#if QK_K == 256
|
2486
|
+
|
2487
|
+
float max_scale = 0; // as we are deducting the min, scales are always positive
|
2488
|
+
float max_min = 0;
|
2489
|
+
for (int j = 0; j < QK_K/32; ++j) {
|
2490
|
+
//scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
|
2491
|
+
float sum_x2 = 0;
|
2492
|
+
for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
|
2493
|
+
float av_x = sqrtf(sum_x2/32);
|
2494
|
+
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
2495
|
+
scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
|
2496
|
+
float scale = scales[j];
|
2497
|
+
if (scale > max_scale) {
|
2498
|
+
max_scale = scale;
|
2499
|
+
}
|
2500
|
+
float min = mins[j];
|
2501
|
+
if (min > max_min) {
|
2502
|
+
max_min = min;
|
2503
|
+
}
|
2504
|
+
}
|
2505
|
+
|
2506
|
+
float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
|
2507
|
+
float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
|
2508
|
+
for (int j = 0; j < QK_K/32; ++j) {
|
2509
|
+
uint8_t ls = nearest_int(inv_scale*scales[j]);
|
2510
|
+
uint8_t lm = nearest_int(inv_min*mins[j]);
|
2511
|
+
ls = MIN(63, ls);
|
2512
|
+
lm = MIN(63, lm);
|
2513
|
+
if (j < 4) {
|
2514
|
+
y[i].scales[j] = ls;
|
2515
|
+
y[i].scales[j+4] = lm;
|
2516
|
+
} else {
|
2517
|
+
y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
|
2518
|
+
y[i].scales[j-4] |= ((ls >> 4) << 6);
|
2519
|
+
y[i].scales[j-0] |= ((lm >> 4) << 6);
|
2520
|
+
}
|
2521
|
+
}
|
2522
|
+
y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
|
2523
|
+
y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
|
2524
|
+
|
2525
|
+
uint8_t sc, m;
|
2526
|
+
for (int j = 0; j < QK_K/32; ++j) {
|
2527
|
+
get_scale_min_k4(j, y[i].scales, &sc, &m);
|
2528
|
+
const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
|
2529
|
+
if (!d) continue;
|
2530
|
+
const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
|
2531
|
+
for (int ii = 0; ii < 32; ++ii) {
|
2532
|
+
int l = nearest_int((x[32*j + ii] + dm)/d);
|
2533
|
+
l = MAX(0, MIN(31, l));
|
2534
|
+
L[32*j + ii] = l;
|
2535
|
+
}
|
2536
|
+
}
|
2537
|
+
|
2538
|
+
uint8_t * restrict qh = y[i].qh;
|
2539
|
+
uint8_t * restrict ql = y[i].qs;
|
2540
|
+
memset(qh, 0, QK_K/8);
|
2541
|
+
|
2542
|
+
uint8_t m1 = 1, m2 = 2;
|
2543
|
+
for (int n = 0; n < QK_K; n += 64) {
|
2544
|
+
for (int j = 0; j < 32; ++j) {
|
2068
2545
|
int l1 = L[n + j];
|
2069
2546
|
if (l1 > 15) {
|
2070
2547
|
l1 -= 16; qh[j] |= m1;
|
@@ -2081,7 +2558,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
2081
2558
|
#else
|
2082
2559
|
float max_scale = 0, amax = 0;
|
2083
2560
|
for (int j = 0; j < QK_K/16; ++j) {
|
2084
|
-
scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1);
|
2561
|
+
scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL);
|
2085
2562
|
float abs_scale = fabsf(scales[j]);
|
2086
2563
|
if (abs_scale > amax) {
|
2087
2564
|
amax = abs_scale;
|
@@ -2192,6 +2669,123 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n
|
|
2192
2669
|
return (n/QK_K*sizeof(block_q5_K));
|
2193
2670
|
}
|
2194
2671
|
|
2672
|
+
// Quantizes one row of n_per_row floats into block_q5_K blocks, optionally using importance weights.
//
//   x              input row (n_per_row values, a multiple of QK_K)
//   y              output blocks
//   quant_weights  per-position importance weights for the row, or NULL to use av_x + |x| weights
//
// When QK_K != 256 this simply forwards to quantize_row_q5_K_reference and ignores the weights.
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
#if QK_K != 256
    (void)quant_weights;
    quantize_row_q5_K_reference(x, y, n_per_row);
#else
    assert(n_per_row % QK_K == 0);
    const int nb = n_per_row / QK_K;

    uint8_t L[QK_K];
    float   mins[QK_K/32];
    float   scales[QK_K/32];
    float   weights[32];
    uint8_t Laux[32];

    for (int ib = 0; ib < nb; ib++) {

        // root-mean-square of the block, used to regularize the per-value weights
        float sum_x2 = 0;
        for (int l = 0; l < QK_K; ++l) {
            sum_x2 += x[l] * x[l];
        }
        const float sigma2 = sum_x2/QK_K;
        const float av_x   = sqrtf(sigma2);

        // 5-bit quantization (levels 0..31) of every 32-value sub-block with its own scale and min
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min   = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            if (quant_weights) {
                const float * qw = quant_weights + QK_K*ib + 32*j;
                for (int l = 0; l < 32; ++l) {
                    weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
                }
            } else {
                for (int l = 0; l < 32; ++l) {
                    weights[l] = av_x + fabsf(x[32*j + l]);
                }
            }
            scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
            if (scales[j] > max_scale) {
                max_scale = scales[j];
            }
            if (mins[j] > max_min) {
                max_min = mins[j];
            }
        }

        // sub-block scales and mins are stored as 6-bit values (0..63) relative to the block maxima
        const float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
        const float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
        for (int j = 0; j < QK_K/32; ++j) {
            uint8_t ls = nearest_int(inv_scale*scales[j]);
            uint8_t lm = nearest_int(inv_min*mins[j]);
            ls = MIN(63, ls);
            lm = MIN(63, lm);
            if (j >= 4) {
                // low 4 bits share a byte; the high 2 bits go into the top of the bytes written for j < 4
                y[ib].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
                y[ib].scales[j-4] |= ((ls >> 4) << 6);
                y[ib].scales[j-0] |= ((lm >> 4) << 6);
            } else {
                y[ib].scales[j]   = ls;
                y[ib].scales[j+4] = lm;
            }
        }
        y[ib].d    = GGML_FP32_TO_FP16(max_scale/63.f);
        y[ib].dmin = GGML_FP32_TO_FP16(max_min/63.f);

        // re-quantize the values against the scale/min pairs as they will be decoded
        const float d_block    = GGML_FP16_TO_FP32(y[ib].d);
        const float dmin_block = GGML_FP16_TO_FP32(y[ib].dmin);
        uint8_t sc, m;
        for (int j = 0; j < QK_K/32; ++j) {
            get_scale_min_k4(j, y[ib].scales, &sc, &m);
            const float d = d_block * sc;
            if (!d) {
                continue;
            }
            const float dm = dmin_block * m;
            for (int ii = 0; ii < 32; ++ii) {
                int l = nearest_int((x[32*j + ii] + dm)/d);
                l = MAX(0, MIN(31, l));
                L[32*j + ii] = l;
            }
        }

        // low 4 bits of each level go to qs (two per byte); the fifth bit goes to qh
        uint8_t * restrict qh = y[ib].qh;
        uint8_t * restrict ql = y[ib].qs;
        memset(qh, 0, QK_K/8);

        uint8_t m1 = 1, m2 = 2;
        for (int n = 0; n < QK_K; n += 64) {
            for (int j = 0; j < 32; ++j) {
                int l1 = L[n + j];
                if (l1 > 15) {
                    l1 -= 16;
                    qh[j] |= m1;
                }
                int l2 = L[n + j + 32];
                if (l2 > 15) {
                    l2 -= 16;
                    qh[j] |= m2;
                }
                ql[j] = l1 | (l2 << 4);
            }
            m1 <<= 2;
            m2 <<= 2;
            ql += 32;
        }

        x += QK_K;

    }
#endif
}
|
2771
|
+
|
2772
|
+
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
2773
|
+
(void)hist;
|
2774
|
+
int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
2775
|
+
if (!quant_weights) {
|
2776
|
+
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
2777
|
+
}
|
2778
|
+
else {
|
2779
|
+
char * qrow = (char *)dst;
|
2780
|
+
for (int row = 0; row < nrow; ++row) {
|
2781
|
+
quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
|
2782
|
+
src += n_per_row;
|
2783
|
+
qrow += row_size;
|
2784
|
+
}
|
2785
|
+
}
|
2786
|
+
return nrow * row_size;
|
2787
|
+
}
|
2788
|
+
|
2195
2789
|
// ====================== 6-bit (de)-quantization
|
2196
2790
|
|
2197
2791
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
@@ -2208,7 +2802,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|
2208
2802
|
|
2209
2803
|
for (int ib = 0; ib < QK_K/16; ++ib) {
|
2210
2804
|
|
2211
|
-
const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1);
|
2805
|
+
const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
|
2212
2806
|
scales[ib] = scale;
|
2213
2807
|
|
2214
2808
|
const float abs_scale = fabsf(scale);
|
@@ -2276,68 +2870,365 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|
2276
2870
|
}
|
2277
2871
|
}
|
2278
2872
|
|
2279
|
-
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
|
2280
|
-
assert(k % QK_K == 0);
|
2281
|
-
const int nb = k / QK_K;
|
2873
|
+
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
|
2874
|
+
assert(k % QK_K == 0);
|
2875
|
+
const int nb = k / QK_K;
|
2876
|
+
|
2877
|
+
for (int i = 0; i < nb; i++) {
|
2878
|
+
|
2879
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
2880
|
+
|
2881
|
+
const uint8_t * restrict ql = x[i].ql;
|
2882
|
+
const uint8_t * restrict qh = x[i].qh;
|
2883
|
+
const int8_t * restrict sc = x[i].scales;
|
2884
|
+
|
2885
|
+
#if QK_K == 256
|
2886
|
+
for (int n = 0; n < QK_K; n += 128) {
|
2887
|
+
for (int l = 0; l < 32; ++l) {
|
2888
|
+
int is = l/16;
|
2889
|
+
const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
2890
|
+
const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
2891
|
+
const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
2892
|
+
const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
2893
|
+
y[l + 0] = d * sc[is + 0] * q1;
|
2894
|
+
y[l + 32] = d * sc[is + 2] * q2;
|
2895
|
+
y[l + 64] = d * sc[is + 4] * q3;
|
2896
|
+
y[l + 96] = d * sc[is + 6] * q4;
|
2897
|
+
}
|
2898
|
+
y += 128;
|
2899
|
+
ql += 64;
|
2900
|
+
qh += 32;
|
2901
|
+
sc += 8;
|
2902
|
+
}
|
2903
|
+
#else
|
2904
|
+
for (int l = 0; l < 16; ++l) {
|
2905
|
+
const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
2906
|
+
const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
2907
|
+
const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
2908
|
+
const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
2909
|
+
y[l+ 0] = d * sc[0] * q1;
|
2910
|
+
y[l+16] = d * sc[1] * q2;
|
2911
|
+
y[l+32] = d * sc[2] * q3;
|
2912
|
+
y[l+48] = d * sc[3] * q4;
|
2913
|
+
}
|
2914
|
+
y += 64;
|
2915
|
+
#endif
|
2916
|
+
|
2917
|
+
}
|
2918
|
+
}
|
2919
|
+
|
2920
|
+
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
2921
|
+
assert(k % QK_K == 0);
|
2922
|
+
block_q6_K * restrict y = vy;
|
2923
|
+
quantize_row_q6_K_reference(x, y, k);
|
2924
|
+
}
|
2925
|
+
|
2926
|
+
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
2927
|
+
assert(k % QK_K == 0);
|
2928
|
+
(void)hist; // TODO: collect histograms
|
2929
|
+
|
2930
|
+
for (int j = 0; j < n; j += k) {
|
2931
|
+
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
2932
|
+
quantize_row_q6_K_reference(src + j, y, k);
|
2933
|
+
}
|
2934
|
+
return (n/QK_K*sizeof(block_q6_K));
|
2935
|
+
}
|
2936
|
+
|
2937
|
+
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
|
2938
|
+
#if QK_K != 256
|
2939
|
+
(void)quant_weights;
|
2940
|
+
quantize_row_q6_K_reference(x, y, n_per_row);
|
2941
|
+
#else
|
2942
|
+
assert(n_per_row % QK_K == 0);
|
2943
|
+
const int nb = n_per_row / QK_K;
|
2944
|
+
|
2945
|
+
int8_t L[QK_K];
|
2946
|
+
float scales[QK_K/16];
|
2947
|
+
//float weights[16];
|
2948
|
+
|
2949
|
+
for (int i = 0; i < nb; i++) {
|
2950
|
+
|
2951
|
+
//float sum_x2 = 0;
|
2952
|
+
//for (int j = 0; j < QK_K; ++j) sum_x2 += x[j]*x[j];
|
2953
|
+
//float sigma2 = sum_x2/QK_K;
|
2954
|
+
|
2955
|
+
float max_scale = 0;
|
2956
|
+
float max_abs_scale = 0;
|
2957
|
+
|
2958
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
2959
|
+
|
2960
|
+
float scale;
|
2961
|
+
if (quant_weights) {
|
2962
|
+
const float * qw = quant_weights + QK_K*i + 16*ib;
|
2963
|
+
//for (int j = 0; j < 16; ++j) weights[j] = qw[j] * sqrtf(sigma2 + x[16*ib + j]*x[16*ib + j]);
|
2964
|
+
//scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, weights);
|
2965
|
+
scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw);
|
2966
|
+
} else {
|
2967
|
+
scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
|
2968
|
+
}
|
2969
|
+
scales[ib] = scale;
|
2970
|
+
|
2971
|
+
const float abs_scale = fabsf(scale);
|
2972
|
+
if (abs_scale > max_abs_scale) {
|
2973
|
+
max_abs_scale = abs_scale;
|
2974
|
+
max_scale = scale;
|
2975
|
+
}
|
2976
|
+
|
2977
|
+
}
|
2978
|
+
|
2979
|
+
if (!max_abs_scale) {
|
2980
|
+
memset(&y[i], 0, sizeof(block_q6_K));
|
2981
|
+
y[i].d = GGML_FP32_TO_FP16(0.f);
|
2982
|
+
x += QK_K;
|
2983
|
+
continue;
|
2984
|
+
}
|
2985
|
+
|
2986
|
+
float iscale = -128.f/max_scale;
|
2987
|
+
y[i].d = GGML_FP32_TO_FP16(1/iscale);
|
2988
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
2989
|
+
y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
|
2990
|
+
}
|
2991
|
+
|
2992
|
+
for (int j = 0; j < QK_K/16; ++j) {
|
2993
|
+
float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
|
2994
|
+
if (!d) {
|
2995
|
+
continue;
|
2996
|
+
}
|
2997
|
+
for (int ii = 0; ii < 16; ++ii) {
|
2998
|
+
int l = nearest_int(x[16*j + ii]/d);
|
2999
|
+
l = MAX(-32, MIN(31, l));
|
3000
|
+
L[16*j + ii] = l + 32;
|
3001
|
+
}
|
3002
|
+
}
|
3003
|
+
|
3004
|
+
uint8_t * restrict ql = y[i].ql;
|
3005
|
+
uint8_t * restrict qh = y[i].qh;
|
3006
|
+
for (int j = 0; j < QK_K; j += 128) {
|
3007
|
+
for (int l = 0; l < 32; ++l) {
|
3008
|
+
const uint8_t q1 = L[j + l + 0] & 0xF;
|
3009
|
+
const uint8_t q2 = L[j + l + 32] & 0xF;
|
3010
|
+
const uint8_t q3 = L[j + l + 64] & 0xF;
|
3011
|
+
const uint8_t q4 = L[j + l + 96] & 0xF;
|
3012
|
+
ql[l+ 0] = q1 | (q3 << 4);
|
3013
|
+
ql[l+32] = q2 | (q4 << 4);
|
3014
|
+
qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
|
3015
|
+
}
|
3016
|
+
ql += 64;
|
3017
|
+
qh += 32;
|
3018
|
+
}
|
3019
|
+
|
3020
|
+
x += QK_K;
|
3021
|
+
|
3022
|
+
}
|
3023
|
+
#endif
|
3024
|
+
}
|
3025
|
+
|
3026
|
+
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3027
|
+
(void)hist;
|
3028
|
+
int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
3029
|
+
if (!quant_weights) {
|
3030
|
+
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
3031
|
+
}
|
3032
|
+
else {
|
3033
|
+
char * qrow = (char *)dst;
|
3034
|
+
for (int row = 0; row < nrow; ++row) {
|
3035
|
+
quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
|
3036
|
+
src += n_per_row;
|
3037
|
+
qrow += row_size;
|
3038
|
+
}
|
3039
|
+
}
|
3040
|
+
return nrow * row_size;
|
3041
|
+
}
|
3042
|
+
|
3043
|
+
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
|
3044
|
+
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
3045
|
+
|
3046
|
+
if (!quant_weights) {
|
3047
|
+
quantize_row_q4_0_reference(x, y, n_per_row);
|
3048
|
+
return;
|
3049
|
+
}
|
3050
|
+
|
3051
|
+
float weight[QK4_0];
|
3052
|
+
int8_t L[QK4_0];
|
3053
|
+
|
3054
|
+
float sum_x2 = 0;
|
3055
|
+
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
3056
|
+
float sigma2 = sum_x2/n_per_row;
|
3057
|
+
|
3058
|
+
const int nb = n_per_row/QK4_0;
|
3059
|
+
for (int ib = 0; ib < nb; ++ib) {
|
3060
|
+
const float * xb = x + QK4_0 * ib;
|
3061
|
+
const float * qw = quant_weights + QK4_0 * ib;
|
3062
|
+
for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
|
3063
|
+
float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
|
3064
|
+
y[ib].d = GGML_FP32_TO_FP16(d);
|
3065
|
+
for (int j = 0; j < 16; ++j) {
|
3066
|
+
y[ib].qs[j] = L[j] | (L[j+16] << 4);
|
3067
|
+
}
|
3068
|
+
}
|
3069
|
+
}
|
3070
|
+
|
3071
|
+
size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3072
|
+
if (!quant_weights) {
|
3073
|
+
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
3074
|
+
}
|
3075
|
+
int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
3076
|
+
char * qrow = (char *)dst;
|
3077
|
+
for (int row = 0; row < nrow; ++row) {
|
3078
|
+
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
3079
|
+
src += n_per_row;
|
3080
|
+
qrow += row_size;
|
3081
|
+
}
|
3082
|
+
return nrow * row_size;
|
3083
|
+
}
|
3084
|
+
|
3085
|
+
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
|
3086
|
+
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
2282
3087
|
|
2283
|
-
|
3088
|
+
if (!quant_weights) {
|
3089
|
+
quantize_row_q4_1_reference(x, y, n_per_row);
|
3090
|
+
return;
|
3091
|
+
}
|
2284
3092
|
|
2285
|
-
|
3093
|
+
float weight[QK4_1];
|
3094
|
+
uint8_t L[QK4_1], Laux[QK4_1];
|
2286
3095
|
|
2287
|
-
|
2288
|
-
|
2289
|
-
|
3096
|
+
float sum_x2 = 0;
|
3097
|
+
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
3098
|
+
float sigma2 = sum_x2/n_per_row;
|
2290
3099
|
|
2291
|
-
|
2292
|
-
|
2293
|
-
|
2294
|
-
|
2295
|
-
|
2296
|
-
|
2297
|
-
|
2298
|
-
|
2299
|
-
|
2300
|
-
|
2301
|
-
|
2302
|
-
y[l + 96] = d * sc[is + 6] * q4;
|
2303
|
-
}
|
2304
|
-
y += 128;
|
2305
|
-
ql += 64;
|
2306
|
-
qh += 32;
|
2307
|
-
sc += 8;
|
3100
|
+
const int nb = n_per_row/QK4_1;
|
3101
|
+
for (int ib = 0; ib < nb; ++ib) {
|
3102
|
+
const float * xb = x + QK4_1 * ib;
|
3103
|
+
const float * qw = quant_weights + QK4_1 * ib;
|
3104
|
+
for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
|
3105
|
+
float min;
|
3106
|
+
float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
|
3107
|
+
y[ib].d = GGML_FP32_TO_FP16(d);
|
3108
|
+
y[ib].m = GGML_FP32_TO_FP16(-min);
|
3109
|
+
for (int j = 0; j < 16; ++j) {
|
3110
|
+
y[ib].qs[j] = L[j] | (L[j+16] << 4);
|
2308
3111
|
}
|
2309
|
-
|
2310
|
-
|
2311
|
-
|
2312
|
-
|
2313
|
-
|
2314
|
-
|
2315
|
-
|
2316
|
-
|
2317
|
-
|
2318
|
-
|
3112
|
+
}
|
3113
|
+
}
|
3114
|
+
|
3115
|
+
size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3116
|
+
if (!quant_weights) {
|
3117
|
+
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
3118
|
+
}
|
3119
|
+
int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
3120
|
+
char * qrow = (char *)dst;
|
3121
|
+
for (int row = 0; row < nrow; ++row) {
|
3122
|
+
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
3123
|
+
src += n_per_row;
|
3124
|
+
qrow += row_size;
|
3125
|
+
}
|
3126
|
+
return nrow * row_size;
|
3127
|
+
}
|
3128
|
+
|
3129
|
+
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
|
3130
|
+
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
3131
|
+
|
3132
|
+
if (!quant_weights) {
|
3133
|
+
quantize_row_q5_0_reference(x, y, n_per_row);
|
3134
|
+
return;
|
3135
|
+
}
|
3136
|
+
|
3137
|
+
float weight[QK5_0];
|
3138
|
+
int8_t L[QK5_0];
|
3139
|
+
|
3140
|
+
float sum_x2 = 0;
|
3141
|
+
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
3142
|
+
float sigma2 = sum_x2/n_per_row;
|
3143
|
+
|
3144
|
+
const int nb = n_per_row/QK5_0;
|
3145
|
+
for (int ib = 0; ib < nb; ++ib) {
|
3146
|
+
const float * xb = x + QK5_0 * ib;
|
3147
|
+
const float * qw = quant_weights + QK5_0 * ib;
|
3148
|
+
for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
|
3149
|
+
float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
|
3150
|
+
y[ib].d = GGML_FP32_TO_FP16(d);
|
3151
|
+
|
3152
|
+
uint32_t qh = 0;
|
3153
|
+
|
3154
|
+
for (int j = 0; j < 16; ++j) {
|
3155
|
+
const uint8_t xi0 = L[j];
|
3156
|
+
const uint8_t xi1 = L[j+16];
|
3157
|
+
y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
|
3158
|
+
|
3159
|
+
// get the 5-th bit and store it in qh at the right position
|
3160
|
+
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
3161
|
+
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
|
2319
3162
|
}
|
2320
|
-
y += 64;
|
2321
|
-
#endif
|
2322
3163
|
|
3164
|
+
memcpy(&y[ib].qh, &qh, sizeof(qh));
|
2323
3165
|
}
|
2324
3166
|
}
|
2325
3167
|
|
2326
|
-
|
2327
|
-
|
2328
|
-
|
2329
|
-
|
3168
|
+
size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3169
|
+
if (!quant_weights) {
|
3170
|
+
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
3171
|
+
}
|
3172
|
+
int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
3173
|
+
char * qrow = (char *)dst;
|
3174
|
+
for (int row = 0; row < nrow; ++row) {
|
3175
|
+
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
3176
|
+
src += n_per_row;
|
3177
|
+
qrow += row_size;
|
3178
|
+
}
|
3179
|
+
return nrow * row_size;
|
2330
3180
|
}
|
2331
3181
|
|
2332
|
-
|
2333
|
-
|
2334
|
-
(void)hist; // TODO: collect histograms
|
3182
|
+
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
|
3183
|
+
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
2335
3184
|
|
2336
|
-
|
2337
|
-
|
2338
|
-
|
3185
|
+
if (!quant_weights) {
|
3186
|
+
quantize_row_q5_1_reference(x, y, n_per_row);
|
3187
|
+
return;
|
2339
3188
|
}
|
2340
|
-
|
3189
|
+
|
3190
|
+
float weight[QK5_1];
|
3191
|
+
uint8_t L[QK5_1], Laux[QK5_1];
|
3192
|
+
|
3193
|
+
float sum_x2 = 0;
|
3194
|
+
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
|
3195
|
+
float sigma2 = sum_x2/n_per_row;
|
3196
|
+
|
3197
|
+
const int nb = n_per_row/QK5_1;
|
3198
|
+
for (int ib = 0; ib < nb; ++ib) {
|
3199
|
+
const float * xb = x + QK5_1 * ib;
|
3200
|
+
const float * qw = quant_weights + QK5_1 * ib;
|
3201
|
+
for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
|
3202
|
+
float min;
|
3203
|
+
float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
|
3204
|
+
y[ib].d = GGML_FP32_TO_FP16(d);
|
3205
|
+
y[ib].m = GGML_FP32_TO_FP16(-min);
|
3206
|
+
|
3207
|
+
uint32_t qh = 0;
|
3208
|
+
for (int j = 0; j < 16; ++j) {
|
3209
|
+
const uint8_t xi0 = L[j];
|
3210
|
+
const uint8_t xi1 = L[j+16];
|
3211
|
+
y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
|
3212
|
+
// get the 5-th bit and store it in qh at the right position
|
3213
|
+
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
3214
|
+
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
|
3215
|
+
}
|
3216
|
+
memcpy(&y[ib].qh, &qh, sizeof(qh));
|
3217
|
+
}
|
3218
|
+
}
|
3219
|
+
|
3220
|
+
size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
3221
|
+
if (!quant_weights) {
|
3222
|
+
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
3223
|
+
}
|
3224
|
+
int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
3225
|
+
char * qrow = (char *)dst;
|
3226
|
+
for (int row = 0; row < nrow; ++row) {
|
3227
|
+
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
3228
|
+
src += n_per_row;
|
3229
|
+
qrow += row_size;
|
3230
|
+
}
|
3231
|
+
return nrow * row_size;
|
2341
3232
|
}
|
2342
3233
|
|
2343
3234
|
// ====================== "True" 2-bit (de)-quantization
|
@@ -2553,14 +3444,6 @@ static const uint8_t ksigns_iq2xs[128] = {
|
|
2553
3444
|
|
2554
3445
|
static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
2555
3446
|
|
2556
|
-
void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
|
2557
|
-
(void)x;
|
2558
|
-
(void)y;
|
2559
|
-
(void)k;
|
2560
|
-
assert(k % QK_K == 0);
|
2561
|
-
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
|
2562
|
-
}
|
2563
|
-
|
2564
3447
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
2565
3448
|
assert(k % QK_K == 0);
|
2566
3449
|
const int nb = k / QK_K;
|
@@ -2587,33 +3470,8 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
|
|
2587
3470
|
}
|
2588
3471
|
}
|
2589
3472
|
|
2590
|
-
void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
|
2591
|
-
assert(k % QK_K == 0);
|
2592
|
-
block_iq2_xxs * restrict y = vy;
|
2593
|
-
quantize_row_iq2_xxs_reference(x, y, k);
|
2594
|
-
}
|
2595
|
-
|
2596
|
-
size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
|
2597
|
-
assert(k % QK_K == 0);
|
2598
|
-
(void)hist; // TODO: collect histograms
|
2599
|
-
|
2600
|
-
for (int j = 0; j < n; j += k) {
|
2601
|
-
block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
|
2602
|
-
quantize_row_iq2_xxs_reference(src + j, y, k);
|
2603
|
-
}
|
2604
|
-
return (n/QK_K*sizeof(block_iq2_xxs));
|
2605
|
-
}
|
2606
|
-
|
2607
3473
|
// ====================== 2.3125 bpw (de)-quantization
|
2608
3474
|
|
2609
|
-
void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
|
2610
|
-
(void)x;
|
2611
|
-
(void)y;
|
2612
|
-
(void)k;
|
2613
|
-
assert(k % QK_K == 0);
|
2614
|
-
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
|
2615
|
-
}
|
2616
|
-
|
2617
3475
|
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
|
2618
3476
|
assert(k % QK_K == 0);
|
2619
3477
|
const int nb = k / QK_K;
|
@@ -2639,23 +3497,6 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
2639
3497
|
}
|
2640
3498
|
}
|
2641
3499
|
|
2642
|
-
void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
|
2643
|
-
assert(k % QK_K == 0);
|
2644
|
-
block_iq2_xs * restrict y = vy;
|
2645
|
-
quantize_row_iq2_xs_reference(x, y, k);
|
2646
|
-
}
|
2647
|
-
|
2648
|
-
size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
|
2649
|
-
assert(k % QK_K == 0);
|
2650
|
-
(void)hist; // TODO: collect histograms
|
2651
|
-
|
2652
|
-
for (int j = 0; j < n; j += k) {
|
2653
|
-
block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
|
2654
|
-
quantize_row_iq2_xs_reference(src + j, y, k);
|
2655
|
-
}
|
2656
|
-
return (n/QK_K*sizeof(block_iq2_xs));
|
2657
|
-
}
|
2658
|
-
|
2659
3500
|
//===================================== Q8_K ==============================================
|
2660
3501
|
|
2661
3502
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
@@ -7554,9 +8395,9 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|
7554
8395
|
|
7555
8396
|
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
7556
8397
|
|
7557
|
-
|
7558
|
-
|
7559
|
-
|
8398
|
+
ggml_int8x16x4_t q2u;
|
8399
|
+
ggml_int8x16x4_t q2s;
|
8400
|
+
ggml_int8x16x4_t q8b;
|
7560
8401
|
|
7561
8402
|
int32x4x4_t scales32;
|
7562
8403
|
|
@@ -7578,7 +8419,7 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|
7578
8419
|
scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
|
7579
8420
|
int32x4_t sumi = vdupq_n_s32(0);
|
7580
8421
|
for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
|
7581
|
-
q8b =
|
8422
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
7582
8423
|
q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
|
7583
8424
|
q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
|
7584
8425
|
q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
|
@@ -7699,3 +8540,666 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|
7699
8540
|
*s = 0.125f * sumf;
|
7700
8541
|
#endif
|
7701
8542
|
}
|
8543
|
+
|
8544
|
+
// ================================ IQ2 quantization =============================================
|
8545
|
+
|
8546
|
+
typedef struct {
|
8547
|
+
uint64_t * grid;
|
8548
|
+
int * map;
|
8549
|
+
uint16_t * neighbours;
|
8550
|
+
} iq2_entry_t;
|
8551
|
+
|
8552
|
+
static iq2_entry_t iq2_data[2] = {
|
8553
|
+
{NULL, NULL, NULL},
|
8554
|
+
{NULL, NULL, NULL},
|
8555
|
+
};
|
8556
|
+
|
8557
|
+
static inline int iq2_data_index(int grid_size) {
|
8558
|
+
GGML_ASSERT(grid_size == 256 || grid_size == 512);
|
8559
|
+
return grid_size == 256 ? 0 : 1;
|
8560
|
+
}
|
8561
|
+
|
8562
|
+
static int iq2_compare_func(const void * left, const void * right) {
|
8563
|
+
const int * l = (const int *)left;
|
8564
|
+
const int * r = (const int *)right;
|
8565
|
+
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
|
8566
|
+
}
|
8567
|
+
|
8568
|
+
static void q2xs_init_impl(int grid_size) {
|
8569
|
+
const int gindex = iq2_data_index(grid_size);
|
8570
|
+
if (iq2_data[gindex].grid) {
|
8571
|
+
return;
|
8572
|
+
}
|
8573
|
+
static const uint16_t kgrid_256[256] = {
|
8574
|
+
0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
|
8575
|
+
100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
|
8576
|
+
1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
|
8577
|
+
1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
|
8578
|
+
2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
|
8579
|
+
4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
|
8580
|
+
5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
|
8581
|
+
8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
|
8582
|
+
10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
|
8583
|
+
16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
|
8584
|
+
17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
|
8585
|
+
20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
|
8586
|
+
22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
|
8587
|
+
25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
|
8588
|
+
33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
|
8589
|
+
37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
|
8590
|
+
};
|
8591
|
+
static const uint16_t kgrid_512[512] = {
|
8592
|
+
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
8593
|
+
73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
|
8594
|
+
260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
|
8595
|
+
352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
|
8596
|
+
640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
|
8597
|
+
1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
|
8598
|
+
1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
|
8599
|
+
2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
|
8600
|
+
2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
|
8601
|
+
4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
|
8602
|
+
4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
|
8603
|
+
5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
|
8604
|
+
5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
|
8605
|
+
8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
|
8606
|
+
8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
|
8607
|
+
10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
|
8608
|
+
16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
|
8609
|
+
16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
|
8610
|
+
16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
|
8611
|
+
17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
|
8612
|
+
18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
|
8613
|
+
20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
|
8614
|
+
21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
|
8615
|
+
22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
|
8616
|
+
24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
|
8617
|
+
32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
|
8618
|
+
33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
|
8619
|
+
33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
|
8620
|
+
35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
|
8621
|
+
37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
|
8622
|
+
40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
|
8623
|
+
42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
|
8624
|
+
};
|
8625
|
+
const int kmap_size = 43692;
|
8626
|
+
const int nwant = 2;
|
8627
|
+
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
|
8628
|
+
uint64_t * kgrid_q2xs;
|
8629
|
+
int * kmap_q2xs;
|
8630
|
+
uint16_t * kneighbors_q2xs;
|
8631
|
+
|
8632
|
+
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
8633
|
+
uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
|
8634
|
+
for (int k = 0; k < grid_size; ++k) {
|
8635
|
+
int8_t * pos = (int8_t *)(the_grid + k);
|
8636
|
+
for (int i = 0; i < 8; ++i) {
|
8637
|
+
int l = (kgrid[k] >> 2*i) & 0x3;
|
8638
|
+
pos[i] = 2*l + 1;
|
8639
|
+
}
|
8640
|
+
}
|
8641
|
+
kgrid_q2xs = the_grid;
|
8642
|
+
iq2_data[gindex].grid = the_grid;
|
8643
|
+
kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
|
8644
|
+
iq2_data[gindex].map = kmap_q2xs;
|
8645
|
+
for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
|
8646
|
+
uint64_t aux64;
|
8647
|
+
uint8_t * aux8 = (uint8_t *)&aux64;
|
8648
|
+
for (int i = 0; i < grid_size; ++i) {
|
8649
|
+
aux64 = kgrid_q2xs[i];
|
8650
|
+
uint16_t index = 0;
|
8651
|
+
for (int k=0; k<8; ++k) {
|
8652
|
+
uint16_t q = (aux8[k] - 1)/2;
|
8653
|
+
index |= (q << 2*k);
|
8654
|
+
}
|
8655
|
+
kmap_q2xs[index] = i;
|
8656
|
+
}
|
8657
|
+
int8_t pos[8];
|
8658
|
+
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
8659
|
+
int num_neighbors = 0, num_not_in_map = 0;
|
8660
|
+
for (int i = 0; i < kmap_size; ++i) {
|
8661
|
+
if (kmap_q2xs[i] >= 0) continue;
|
8662
|
+
++num_not_in_map;
|
8663
|
+
for (int k = 0; k < 8; ++k) {
|
8664
|
+
int l = (i >> 2*k) & 0x3;
|
8665
|
+
pos[k] = 2*l + 1;
|
8666
|
+
}
|
8667
|
+
for (int j = 0; j < grid_size; ++j) {
|
8668
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
8669
|
+
int d2 = 0;
|
8670
|
+
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
8671
|
+
dist2[2*j+0] = d2;
|
8672
|
+
dist2[2*j+1] = j;
|
8673
|
+
}
|
8674
|
+
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
8675
|
+
int n = 0; int d2 = dist2[0];
|
8676
|
+
int nhave = 1;
|
8677
|
+
for (int j = 0; j < grid_size; ++j) {
|
8678
|
+
if (dist2[2*j] > d2) {
|
8679
|
+
if (nhave == nwant) break;
|
8680
|
+
d2 = dist2[2*j];
|
8681
|
+
++nhave;
|
8682
|
+
}
|
8683
|
+
++n;
|
8684
|
+
}
|
8685
|
+
num_neighbors += n;
|
8686
|
+
}
|
8687
|
+
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
8688
|
+
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
8689
|
+
iq2_data[gindex].neighbours = kneighbors_q2xs;
|
8690
|
+
int counter = 0;
|
8691
|
+
for (int i = 0; i < kmap_size; ++i) {
|
8692
|
+
if (kmap_q2xs[i] >= 0) continue;
|
8693
|
+
for (int k = 0; k < 8; ++k) {
|
8694
|
+
int l = (i >> 2*k) & 0x3;
|
8695
|
+
pos[k] = 2*l + 1;
|
8696
|
+
}
|
8697
|
+
for (int j = 0; j < grid_size; ++j) {
|
8698
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
8699
|
+
int d2 = 0;
|
8700
|
+
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
8701
|
+
dist2[2*j+0] = d2;
|
8702
|
+
dist2[2*j+1] = j;
|
8703
|
+
}
|
8704
|
+
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
8705
|
+
kmap_q2xs[i] = -(counter + 1);
|
8706
|
+
int d2 = dist2[0];
|
8707
|
+
uint16_t * start = &kneighbors_q2xs[counter++];
|
8708
|
+
int n = 0, nhave = 1;
|
8709
|
+
for (int j = 0; j < grid_size; ++j) {
|
8710
|
+
if (dist2[2*j] > d2) {
|
8711
|
+
if (nhave == nwant) break;
|
8712
|
+
d2 = dist2[2*j];
|
8713
|
+
++nhave;
|
8714
|
+
}
|
8715
|
+
kneighbors_q2xs[counter++] = dist2[2*j+1];
|
8716
|
+
++n;
|
8717
|
+
}
|
8718
|
+
*start = n;
|
8719
|
+
}
|
8720
|
+
free(dist2);
|
8721
|
+
}
|
8722
|
+
|
8723
|
+
void ggml_init_iq2_quantization(enum ggml_type type) {
|
8724
|
+
if (type == GGML_TYPE_IQ2_XXS) {
|
8725
|
+
q2xs_init_impl(256);
|
8726
|
+
}
|
8727
|
+
else if (type == GGML_TYPE_IQ2_XS) {
|
8728
|
+
q2xs_init_impl(512);
|
8729
|
+
}
|
8730
|
+
else {
|
8731
|
+
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
8732
|
+
}
|
8733
|
+
}
|
8734
|
+
|
8735
|
+
static void q2xs_deinit_impl(int grid_size) {
|
8736
|
+
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
|
8737
|
+
const int gindex = iq2_data_index(grid_size);
|
8738
|
+
if (iq2_data[gindex].grid) {
|
8739
|
+
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
8740
|
+
free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
|
8741
|
+
free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
|
8742
|
+
}
|
8743
|
+
}
|
8744
|
+
|
8745
|
+
void ggml_deinit_iq2_quantization(enum ggml_type type) {
|
8746
|
+
if (type == GGML_TYPE_IQ2_XXS) {
|
8747
|
+
q2xs_deinit_impl(256);
|
8748
|
+
}
|
8749
|
+
else if (type == GGML_TYPE_IQ2_XS) {
|
8750
|
+
q2xs_deinit_impl(512);
|
8751
|
+
}
|
8752
|
+
else {
|
8753
|
+
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
8754
|
+
}
|
8755
|
+
}
|
8756
|
+
|
8757
|
+
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
8758
|
+
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
8759
|
+
int num_neighbors = neighbours[0];
|
8760
|
+
GGML_ASSERT(num_neighbors > 0);
|
8761
|
+
float best_d2 = FLT_MAX;
|
8762
|
+
int grid_index = -1;
|
8763
|
+
for (int j = 1; j <= num_neighbors; ++j) {
|
8764
|
+
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
|
8765
|
+
float d2 = 0;
|
8766
|
+
for (int i = 0; i < 8; ++i) {
|
8767
|
+
float q = pg[i];
|
8768
|
+
float diff = scale*q - xval[i];
|
8769
|
+
d2 += weight[i]*diff*diff;
|
8770
|
+
}
|
8771
|
+
if (d2 < best_d2) {
|
8772
|
+
best_d2 = d2; grid_index = neighbours[j];
|
8773
|
+
}
|
8774
|
+
}
|
8775
|
+
GGML_ASSERT(grid_index >= 0);
|
8776
|
+
const int8_t * pg = (const int8_t *)(grid + grid_index);
|
8777
|
+
for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
|
8778
|
+
return grid_index;
|
8779
|
+
}
|
8780
|
+
|
8781
|
+
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
8782
|
+
|
8783
|
+
const int gindex = iq2_data_index(256);
|
8784
|
+
|
8785
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
8786
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
8787
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
8788
|
+
|
8789
|
+
GGML_ASSERT(quant_weights);
|
8790
|
+
GGML_ASSERT(kgrid_q2xs);
|
8791
|
+
GGML_ASSERT(kmap_q2xs);
|
8792
|
+
GGML_ASSERT(kneighbors_q2xs);
|
8793
|
+
GGML_ASSERT(n%QK_K == 0);
|
8794
|
+
|
8795
|
+
const int kMaxQ = 3;
|
8796
|
+
|
8797
|
+
const int nbl = n/256;
|
8798
|
+
|
8799
|
+
block_iq2_xxs * y = vy;
|
8800
|
+
|
8801
|
+
float scales[QK_K/32];
|
8802
|
+
float weight[32];
|
8803
|
+
float xval[32];
|
8804
|
+
int8_t L[32];
|
8805
|
+
int8_t Laux[32];
|
8806
|
+
float waux[32];
|
8807
|
+
bool is_on_grid[4];
|
8808
|
+
bool is_on_grid_aux[4];
|
8809
|
+
uint8_t block_signs[4];
|
8810
|
+
uint32_t q2[2*(QK_K/32)];
|
8811
|
+
|
8812
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
8813
|
+
|
8814
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
8815
|
+
memset(q2, 0, QK_K/4);
|
8816
|
+
|
8817
|
+
float max_scale = 0;
|
8818
|
+
|
8819
|
+
const float * xbl = x + QK_K*ibl;
|
8820
|
+
float sumx2 = 0;
|
8821
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
8822
|
+
float sigma2 = sumx2/QK_K;
|
8823
|
+
|
8824
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
8825
|
+
const float * xb = xbl + 32*ib;
|
8826
|
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
8827
|
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
8828
|
+
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
8829
|
+
for (int k = 0; k < 4; ++k) {
|
8830
|
+
int nflip = 0;
|
8831
|
+
uint8_t s = 0;
|
8832
|
+
for (int i = 0; i < 8; ++i) {
|
8833
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
8834
|
+
else {
|
8835
|
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
8836
|
+
}
|
8837
|
+
}
|
8838
|
+
if (nflip%2) {
|
8839
|
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
8840
|
+
for (int i = 1; i < 8; ++i) {
|
8841
|
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
8842
|
+
if (ax < min) {
|
8843
|
+
min = ax; imin = i;
|
8844
|
+
}
|
8845
|
+
}
|
8846
|
+
xval[8*k+imin] = -xval[8*k+imin];
|
8847
|
+
s ^= (1 << imin);
|
8848
|
+
}
|
8849
|
+
block_signs[k] = s & 127;
|
8850
|
+
}
|
8851
|
+
float max = xval[0];
|
8852
|
+
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
8853
|
+
if (!max) {
|
8854
|
+
scales[ib] = 0;
|
8855
|
+
memset(L, 0, 32);
|
8856
|
+
continue;
|
8857
|
+
}
|
8858
|
+
float best = 0;
|
8859
|
+
float scale = max/(2*kMaxQ-1);
|
8860
|
+
for (int is = -9; is <= 9; ++is) {
|
8861
|
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
8862
|
+
float this_scale = 1/id;
|
8863
|
+
for (int k = 0; k < 4; ++k) {
|
8864
|
+
for (int i = 0; i < 8; ++i) {
|
8865
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
8866
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
8867
|
+
}
|
8868
|
+
uint16_t u = 0;
|
8869
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
8870
|
+
int grid_index = kmap_q2xs[u];
|
8871
|
+
is_on_grid_aux[k] = true;
|
8872
|
+
if (grid_index < 0) {
|
8873
|
+
is_on_grid_aux[k] = false;
|
8874
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
8875
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
8876
|
+
}
|
8877
|
+
}
|
8878
|
+
float sumqx = 0, sumq2 = 0;
|
8879
|
+
for (int i = 0; i < 32; ++i) {
|
8880
|
+
float w = weight[i];
|
8881
|
+
float q = 2*Laux[i] + 1;
|
8882
|
+
sumqx += w*xval[i]*q;
|
8883
|
+
sumq2 += w*q*q;
|
8884
|
+
}
|
8885
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
8886
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
8887
|
+
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
|
8888
|
+
for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
8889
|
+
}
|
8890
|
+
}
|
8891
|
+
int n_not_ongrid = 0;
|
8892
|
+
for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
8893
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
8894
|
+
float id = 1/scale;
|
8895
|
+
for (int k = 0; k < 4; ++k) {
|
8896
|
+
if (is_on_grid[k]) continue;
|
8897
|
+
uint16_t u = 0;
|
8898
|
+
for (int i = 0; i < 8; ++i) {
|
8899
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
8900
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
8901
|
+
u |= (l << 2*i);
|
8902
|
+
}
|
8903
|
+
int grid_index = kmap_q2xs[u];
|
8904
|
+
if (grid_index < 0) {
|
8905
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
8906
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
8907
|
+
}
|
8908
|
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
|
8909
|
+
for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
|
8910
|
+
}
|
8911
|
+
float sumqx = 0, sumq2 = 0;
|
8912
|
+
for (int i = 0; i < 32; ++i) {
|
8913
|
+
float w = weight[i];
|
8914
|
+
float q = 2*L[i] + 1;
|
8915
|
+
sumqx += w*xval[i]*q;
|
8916
|
+
sumq2 += w*q*q;
|
8917
|
+
}
|
8918
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
8919
|
+
}
|
8920
|
+
if (scale < 0) {
|
8921
|
+
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
8922
|
+
// and correspondingly flip quant signs.
|
8923
|
+
scale = -scale;
|
8924
|
+
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
8925
|
+
}
|
8926
|
+
for (int k = 0; k < 4; ++k) {
|
8927
|
+
uint16_t u = 0;
|
8928
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
8929
|
+
int grid_index = kmap_q2xs[u];
|
8930
|
+
if (grid_index < 0) {
|
8931
|
+
printf("Oops: found point %u not on grid:", u);
|
8932
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
8933
|
+
printf("\n");
|
8934
|
+
GGML_ASSERT(false);
|
8935
|
+
}
|
8936
|
+
q2[2*ib+0] |= (grid_index << 8*k);
|
8937
|
+
q2[2*ib+1] |= (block_signs[k] << 7*k);
|
8938
|
+
}
|
8939
|
+
GGML_ASSERT(scale >= 0);
|
8940
|
+
scales[ib] = scale;
|
8941
|
+
max_scale = MAX(max_scale, scale);
|
8942
|
+
}
|
8943
|
+
|
8944
|
+
if (!max_scale) {
|
8945
|
+
memset(y[ibl].qs, 0, QK_K/4);
|
8946
|
+
continue;
|
8947
|
+
}
|
8948
|
+
|
8949
|
+
float d = max_scale/31;
|
8950
|
+
y[ibl].d = GGML_FP32_TO_FP16(d);
|
8951
|
+
float id = 1/d;
|
8952
|
+
float sumqx = 0, sumq2 = 0;
|
8953
|
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
8954
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
8955
|
+
l = MAX(0, MIN(15, l));
|
8956
|
+
q2[2*ib+1] |= ((uint32_t)l << 28);
|
8957
|
+
const float * xb = xbl + 32*ib;
|
8958
|
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
8959
|
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
8960
|
+
const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
|
8961
|
+
const float db = d * (1 + 2*l);
|
8962
|
+
uint32_t u = 0;
|
8963
|
+
for (int k = 0; k < 4; ++k) {
|
8964
|
+
const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
|
8965
|
+
const float * xk = xb + 8*k;
|
8966
|
+
const float * wk = weight + 8*k;
|
8967
|
+
const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
|
8968
|
+
float best_mse = 0; int best_index = aux8[k];
|
8969
|
+
for (int j = 0; j < 8; ++j) {
|
8970
|
+
float diff = db * grid[j] * signs[j] - xk[j];
|
8971
|
+
best_mse += wk[j] * diff * diff;
|
8972
|
+
}
|
8973
|
+
for (int idx = 0; idx < 256; ++idx) {
|
8974
|
+
grid = (const uint8_t *)(kgrid_q2xs + idx);
|
8975
|
+
float mse = 0;
|
8976
|
+
for (int j = 0; j < 8; ++j) {
|
8977
|
+
float diff = db * grid[j] * signs[j] - xk[j];
|
8978
|
+
mse += wk[j] * diff * diff;
|
8979
|
+
}
|
8980
|
+
if (mse < best_mse) {
|
8981
|
+
best_mse = mse; best_index = idx;
|
8982
|
+
}
|
8983
|
+
}
|
8984
|
+
u |= (best_index << 8*k);
|
8985
|
+
grid = (const uint8_t *)(kgrid_q2xs + best_index);
|
8986
|
+
//grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
|
8987
|
+
for (int j = 0; j < 8; ++j) {
|
8988
|
+
float q = db * grid[j] * signs[j];
|
8989
|
+
sumqx += wk[j] * q * xk[j];
|
8990
|
+
sumq2 += wk[j] * q * q;
|
8991
|
+
}
|
8992
|
+
}
|
8993
|
+
q2[2*ib] = u;
|
8994
|
+
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
8995
|
+
}
|
8996
|
+
memcpy(y[ibl].qs, q2, QK_K/4);
|
8997
|
+
}
|
8998
|
+
}
|
8999
|
+
|
9000
|
+
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
9001
|
+
|
9002
|
+
const int gindex = iq2_data_index(512);
|
9003
|
+
|
9004
|
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
9005
|
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
9006
|
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
9007
|
+
|
9008
|
+
GGML_ASSERT(quant_weights);
|
9009
|
+
GGML_ASSERT(kmap_q2xs);
|
9010
|
+
GGML_ASSERT(kgrid_q2xs);
|
9011
|
+
GGML_ASSERT(kneighbors_q2xs);
|
9012
|
+
GGML_ASSERT(n%QK_K == 0);
|
9013
|
+
|
9014
|
+
const int kMaxQ = 3;
|
9015
|
+
|
9016
|
+
const int nbl = n/256;
|
9017
|
+
|
9018
|
+
block_iq2_xs * y = vy;
|
9019
|
+
|
9020
|
+
float scales[QK_K/16];
|
9021
|
+
float weight[16];
|
9022
|
+
float xval[16];
|
9023
|
+
int8_t L[16];
|
9024
|
+
int8_t Laux[16];
|
9025
|
+
float waux[16];
|
9026
|
+
bool is_on_grid[2];
|
9027
|
+
bool is_on_grid_aux[2];
|
9028
|
+
uint8_t block_signs[2];
|
9029
|
+
uint16_t q2[2*(QK_K/16)];
|
9030
|
+
|
9031
|
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
9032
|
+
|
9033
|
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
9034
|
+
memset(q2, 0, QK_K/4);
|
9035
|
+
memset(y[ibl].scales, 0, QK_K/32);
|
9036
|
+
|
9037
|
+
float max_scale = 0;
|
9038
|
+
|
9039
|
+
const float * xbl = x + QK_K*ibl;
|
9040
|
+
float sumx2 = 0;
|
9041
|
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
9042
|
+
float sigma2 = sumx2/QK_K;
|
9043
|
+
|
9044
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
9045
|
+
const float * xb = xbl + 16*ib;
|
9046
|
+
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
9047
|
+
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
9048
|
+
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
9049
|
+
for (int k = 0; k < 2; ++k) {
|
9050
|
+
int nflip = 0;
|
9051
|
+
uint8_t s = 0;
|
9052
|
+
for (int i = 0; i < 8; ++i) {
|
9053
|
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
9054
|
+
else {
|
9055
|
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
9056
|
+
}
|
9057
|
+
}
|
9058
|
+
if (nflip%2) {
|
9059
|
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
9060
|
+
for (int i = 1; i < 8; ++i) {
|
9061
|
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
9062
|
+
if (ax < min) {
|
9063
|
+
min = ax; imin = i;
|
9064
|
+
}
|
9065
|
+
}
|
9066
|
+
xval[8*k+imin] = -xval[8*k+imin];
|
9067
|
+
s ^= (1 << imin);
|
9068
|
+
}
|
9069
|
+
block_signs[k] = s & 127;
|
9070
|
+
}
|
9071
|
+
float max = xval[0];
|
9072
|
+
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
9073
|
+
if (!max) {
|
9074
|
+
scales[ib] = 0;
|
9075
|
+
memset(L, 0, 16);
|
9076
|
+
continue;
|
9077
|
+
}
|
9078
|
+
float best = 0;
|
9079
|
+
float scale = max/(2*kMaxQ-1);
|
9080
|
+
is_on_grid[0] = is_on_grid[1] = true;
|
9081
|
+
for (int is = -9; is <= 9; ++is) {
|
9082
|
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
9083
|
+
float this_scale = 1/id;
|
9084
|
+
for (int k = 0; k < 2; ++k) {
|
9085
|
+
for (int i = 0; i < 8; ++i) {
|
9086
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
9087
|
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
9088
|
+
}
|
9089
|
+
uint16_t u = 0;
|
9090
|
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
9091
|
+
int grid_index = kmap_q2xs[u];
|
9092
|
+
is_on_grid_aux[k] = true;
|
9093
|
+
if (grid_index < 0) {
|
9094
|
+
is_on_grid_aux[k] = false;
|
9095
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
9096
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
9097
|
+
}
|
9098
|
+
}
|
9099
|
+
float sumqx = 0, sumq2 = 0;
|
9100
|
+
for (int i = 0; i < 16; ++i) {
|
9101
|
+
float w = weight[i];
|
9102
|
+
float q = 2*Laux[i] + 1;
|
9103
|
+
sumqx += w*xval[i]*q;
|
9104
|
+
sumq2 += w*q*q;
|
9105
|
+
}
|
9106
|
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
9107
|
+
scale = sumqx/sumq2; best = scale*sumqx;
|
9108
|
+
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
9109
|
+
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
9110
|
+
}
|
9111
|
+
}
|
9112
|
+
int n_not_ongrid = 0;
|
9113
|
+
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
9114
|
+
if (n_not_ongrid > 0 && scale > 0) {
|
9115
|
+
float id = 1/scale;
|
9116
|
+
for (int k = 0; k < 2; ++k) {
|
9117
|
+
if (is_on_grid[k]) continue;
|
9118
|
+
uint16_t u = 0;
|
9119
|
+
for (int i = 0; i < 8; ++i) {
|
9120
|
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
9121
|
+
l = MAX(0, MIN(kMaxQ-1, l));
|
9122
|
+
u |= (l << 2*i);
|
9123
|
+
L[8*k + i] = l;
|
9124
|
+
}
|
9125
|
+
int grid_index = kmap_q2xs[u];
|
9126
|
+
if (grid_index < 0) {
|
9127
|
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
9128
|
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
9129
|
+
}
|
9130
|
+
}
|
9131
|
+
float sumqx = 0, sumq2 = 0;
|
9132
|
+
for (int i = 0; i < 16; ++i) {
|
9133
|
+
float w = weight[i];
|
9134
|
+
float q = 2*L[i] + 1;
|
9135
|
+
sumqx += w*xval[i]*q;
|
9136
|
+
sumq2 += w*q*q;
|
9137
|
+
}
|
9138
|
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
9139
|
+
}
|
9140
|
+
if (scale < 0) {
|
9141
|
+
scale = -scale;
|
9142
|
+
for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
9143
|
+
}
|
9144
|
+
for (int k = 0; k < 2; ++k) {
|
9145
|
+
uint16_t u = 0;
|
9146
|
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
9147
|
+
int grid_index = kmap_q2xs[u];
|
9148
|
+
if (grid_index < 0) {
|
9149
|
+
printf("Oops: found point %u not on grid:", u);
|
9150
|
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
9151
|
+
printf("\n");
|
9152
|
+
GGML_ASSERT(false);
|
9153
|
+
}
|
9154
|
+
q2[2*ib+k] = grid_index | (block_signs[k] << 9);
|
9155
|
+
}
|
9156
|
+
GGML_ASSERT(scale >= 0);
|
9157
|
+
scales[ib] = scale;
|
9158
|
+
max_scale = MAX(max_scale, scale);
|
9159
|
+
}
|
9160
|
+
|
9161
|
+
if (!max_scale) {
|
9162
|
+
memset(y[ibl].qs, 0, QK_K/4);
|
9163
|
+
continue;
|
9164
|
+
}
|
9165
|
+
|
9166
|
+
float d = max_scale/31;
|
9167
|
+
y[ibl].d = GGML_FP32_TO_FP16(d);
|
9168
|
+
float id = 1/d;
|
9169
|
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
9170
|
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
9171
|
+
l = MAX(0, MIN(15, l));
|
9172
|
+
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
9173
|
+
else y[ibl].scales[ib/2] |= (l << 4);
|
9174
|
+
}
|
9175
|
+
memcpy(y[ibl].qs, q2, QK_K/4);
|
9176
|
+
|
9177
|
+
}
|
9178
|
+
}
|
9179
|
+
|
9180
|
+
size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
9181
|
+
(void)hist;
|
9182
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
9183
|
+
int nblock = n_per_row/QK_K;
|
9184
|
+
char * qrow = (char *)dst;
|
9185
|
+
for (int row = 0; row < nrow; ++row) {
|
9186
|
+
quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
|
9187
|
+
src += n_per_row;
|
9188
|
+
qrow += nblock*sizeof(block_iq2_xxs);
|
9189
|
+
}
|
9190
|
+
return nrow * nblock * sizeof(block_iq2_xxs);
|
9191
|
+
}
|
9192
|
+
|
9193
|
+
size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
9194
|
+
(void)hist;
|
9195
|
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
9196
|
+
int nblock = n_per_row/QK_K;
|
9197
|
+
char * qrow = (char *)dst;
|
9198
|
+
for (int row = 0; row < nrow; ++row) {
|
9199
|
+
quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
|
9200
|
+
src += n_per_row;
|
9201
|
+
qrow += nblock*sizeof(block_iq2_xs);
|
9202
|
+
}
|
9203
|
+
return nrow * nblock * sizeof(block_iq2_xs);
|
9204
|
+
}
|
9205
|
+
|