llama_cpp 0.14.3 → 0.14.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-quants.c:

@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if __AVXVNNI__
+#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
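The one-line change above widens the VNNI guard: previously only `__AVXVNNI__` enabled the fused unsigned-by-signed dot product, so builds reporting only `__AVX512VNNI__` fell back to the slower path. As a reference for what `_mm256_dpbusd_epi32` computes on that path, here is a scalar model (an illustrative sketch, not code from this diff):

    #include <stdint.h>

    // Scalar model of _mm256_dpbusd_epi32(acc, a, b): within each of the eight
    // 32-bit lanes, multiply four unsigned 8-bit values from `a` by four signed
    // 8-bit values from `b` and accumulate the int32 sum into `acc`.
    static void dpbusd_epi32_scalar(int32_t acc[8], const uint8_t a[32], const int8_t b[32]) {
        for (int lane = 0; lane < 8; ++lane) {
            int32_t sum = 0;
            for (int k = 0; k < 4; ++k) {
                sum += (int32_t)a[4*lane + k] * (int32_t)b[4*lane + k];
            }
            acc[lane] += sum;
        }
    }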
@@ -3474,6 +3474,65 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
     }
 }
 
+void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    float delta[4];
+    uint16_t idx[4];
+
+#if QK_K != 64
+    iq1m_scale_t scale;
+#endif
+
+    for (int i = 0; i < nb; i++) {
+
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+#if QK_K == 64
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+#else
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+        const float d = GGML_FP16_TO_FP32(scale.f16);
+#endif
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+#if QK_K == 64
+            const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1);
+            const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1);
+#else
+            const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
+            const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
+#endif
+            idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
+            idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
+            idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
+            idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
+            delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+            delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+            delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+            delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+            for (int l = 0; l < 2; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl1 * (grid[j] + delta[l]);
+                }
+                y += 8;
+            }
+            for (int l = 2; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl2 * (grid[j] + delta[l]);
+                }
+                y += 8;
+            }
+            qs += 4;
+            qh += 2;
+        }
+    }
+}
+
 static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
 void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
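The scale handling in `dequantize_row_iq1_m` above is the subtle part of the new format: for QK_K != 64, the 16 bits of the fp16 super-block scale are scattered, four at a time, into the top nibble of each 16-bit word of `x[i].scales`, while the low bits of those words carry the per-32-weight block scales. A self-contained round trip of that packing (using a plain `uint16_t` in place of the `iq1m_scale_t` union, which this gem vendors in `ggml-common.h`):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        uint16_t u16 = 0x3c00;          // example fp16 bit pattern (1.0)
        uint16_t sc[4] = {0, 0, 0, 0};  // low 12 bits of each word hold block scales

        // pack, as in quantize_row_iq1_m_impl further down in this diff
        sc[0] |= (uint16_t)((u16 & 0x000f) << 12);
        sc[1] |= (uint16_t)((u16 & 0x00f0) <<  8);
        sc[2] |= (uint16_t)((u16 & 0x0f00) <<  4);
        sc[3] |= (uint16_t)((u16 & 0xf000) <<  0);

        // unpack, exactly as in dequantize_row_iq1_m above
        uint16_t back = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
        assert(back == u16);
        return 0;
    }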
@@ -9695,6 +9754,248 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
 #endif
 }
 
+void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_m * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if QK_K != 64
+    iq1m_scale_t scale;
+#endif
+
+#if defined __ARM_NEON
+
+#if QK_K == 64
+    const int32x4_t mask  = vdupq_n_s32(0xf);
+#else
+    const int32x4_t mask  = vdupq_n_s32(0x7);
+#endif
+    const int32x4_t mone  = vdupq_n_s32(1);
+    const int32x4_t mzero = vdupq_n_s32(0);
+
+    ggml_int8x16x4_t deltas;
+    deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
+    deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
+    deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
+    deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
+
+    ggml_int8x16x4_t q1b;
+    ggml_int8x16x4_t q8b;
+
+    uint32_t aux32;
+    const uint8_t * aux8 = (const uint8_t *)&aux32;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+#if QK_K != 64
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+#endif
+
+        int32x4_t sumi1 = mzero;
+        int32x4_t sumi2 = mzero;
+
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+
+            q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
+            q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
+            q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
+            q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
+
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
+            const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
+            const int32x4_t p12 = vpaddq_s32(p1, p2);
+
+            const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
+            aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
+
+            const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
+            const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
+            const int32x4_t p34 = vpaddq_s32(p3, p4);
+
+#if QK_K == 64
+            int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12);
+#else
+            int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
+#endif
+            scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
+
+            sumi1 = vmlaq_s32(sumi1, scales_4, p12);
+            sumi2 = vmlaq_s32(sumi2, scales_4, p34);
+
+            qs += 8; qh += 4;
+
+        }
+
+#if QK_K == 64
+        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+#else
+        sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+#endif
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+#if QK_K == 64
+    const __m256i mask = _mm256_set1_epi16(0xf);
+#else
+    const __m256i mask = _mm256_set1_epi16(0x7);
+#endif
+    const __m256i mone = _mm256_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+#if QK_K != 64
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+#endif
+
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m256i q1b_1 = _mm256_set_epi64x(
+                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
+                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
+            );
+            const __m256i q1b_2 = _mm256_set_epi64x(
+                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
+                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
+            );
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+
+            const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+            const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
+            const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+#if QK_K == 64
+            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 4), _mm_set1_epi16(sc[0] >> 0));
+            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8));
+#else
+            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
+            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+#endif
+            scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
+            scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+            const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
+            const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
+            const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
+            const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
+
+            sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
+            sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
+
+            qs += 8; qh += 4;
+        }
+
+#if QK_K == 64
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+#else
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+#endif
+        accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
+        accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
+
+    }
+
+    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+#else
+
+    int sum1[2], sum2[2], delta[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+#if QK_K != 64
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+#endif
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            delta[0] = qh[0] & 0x08 ? -1 : 1;
+            delta[1] = qh[0] & 0x80 ? -1 : 1;
+            delta[2] = qh[1] & 0x08 ? -1 : 1;
+            delta[3] = qh[1] & 0x80 ? -1 : 1;
+            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
+                int lsum1 = 0, lsum2 = 0;
+                for (int j = 0; j < 8; ++j) {
+                    lsum1 += q8[j] * grid[j];
+                    lsum2 += q8[j];
+                }
+                q8 += 8;
+                sum1[l/2] += lsum1;
+                sum2[l/2] += lsum2*delta[l];
+            }
+#if QK_K == 64
+            const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1;
+            const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1;
+#else
+            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
+            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
+#endif
+            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
+            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
+            qs += 4;
+            qh += 2;
+        }
+
+#if QK_K == 64
+        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+#else
+        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+#endif
+    }
+
+    *s = sumf;
+
+#endif
+}
+
 void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
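One dense line in the NEON branch above deserves a gloss: `aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202)` moves the two per-half sign bits of each `qh` byte (`0x08` and `0x80`) into a 2-bit index per byte, which then selects one of the four sign patterns prebuilt in `deltas`. A standalone check of just that bit manipulation (illustrative; assumes little-endian byte order, as the vector code effectively does):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        const uint8_t qh[4] = {0x08, 0x80, 0x88, 0x00};
        uint32_t qh32;
        memcpy(&qh32, qh, 4); // the kernel reads qh through a uint32_t pointer; memcpy sidesteps alignment here

        uint32_t aux32 = ((qh32 >> 3) & 0x01010101) | ((qh32 >> 6) & 0x02020202);
        const uint8_t * aux8 = (const uint8_t *)&aux32;

        assert(aux8[0] == 1); // 0x08 set -> first half negative (deltas.val[1] = -1,+1)
        assert(aux8[1] == 2); // 0x80 set -> second half negative (deltas.val[2] = +1,-1)
        assert(aux8[2] == 3); // both negative
        assert(aux8[3] == 0); // both positive
        return 0;
    }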
@@ -9938,17 +10239,17 @@ static iq2_entry_t iq2_data[4] = {
 };
 
 static inline int iq2_data_index(enum ggml_type type) {
-    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
     return type == GGML_TYPE_IQ2_XXS ? 0 :
            type == GGML_TYPE_IQ2_XS  ? 1 :
-           type == GGML_TYPE_IQ1_S   ? 2 : 3;
+           type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
 }
 
 static inline int iq2_grid_size(enum ggml_type type) {
-    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
     return type == GGML_TYPE_IQ2_XXS ? 256 :
            type == GGML_TYPE_IQ2_XS  ? 512 :
-           type == GGML_TYPE_IQ1_S   ? NGRID_IQ1S : 1024;
+           type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
 }
 
 static int iq2_compare_func(const void * left, const void * right) {
@@ -10214,10 +10515,10 @@ void iq2xs_init_impl(enum ggml_type type) {
 
     const int kmap_size = 43692;
     //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
-    const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
+    const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
     const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
                              type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 :
-                             type == GGML_TYPE_IQ1_S   ? kgrid_1bit_2048 : kgrid_2bit_1024;
+                             type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
     uint64_t * kgrid_q2xs;
     int      * kmap_q2xs;
     uint16_t * kneighbors_q2xs;
@@ -10314,7 +10615,7 @@ void iq2xs_init_impl(enum ggml_type type) {
 }
 
 void iq2xs_free_impl(enum ggml_type type) {
-    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
     const int gindex = iq2_data_index(type);
     if (iq2_data[gindex].grid) {
         free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
@@ -11520,7 +11821,16 @@ static int iq1_sort_helper(const void * left, const void * right) {
 }
 
 #define IQ1S_BLOCK_SIZE 32
-static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+#define IQ1M_BLOCK_SIZE 16
+static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+        float    * scales,
+        float    * weight,
+        float    * sumx,
+        float    * sumw,
+        float    * pairs,
+        int8_t   * L,
+        uint16_t * index,
+        int8_t   * shifts) {
 
     const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
 
@@ -11534,22 +11844,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(n%QK_K == 0);
 
+    block_iq1_s * y = vy;
+
     const int nbl = n/QK_K;
 
-
+    const int block_size = IQ1S_BLOCK_SIZE;
 
     const float x_p[3] = {-1 + IQ1S_DELTA,  IQ1S_DELTA, 1 + IQ1S_DELTA};
     const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
 
-
-    float  weight[IQ1S_BLOCK_SIZE];
-    int8_t L[IQ1S_BLOCK_SIZE];
-    float  sumx[IQ1S_BLOCK_SIZE+1];
-    float  sumw[IQ1S_BLOCK_SIZE+1];
-    float  pairs[2*IQ1S_BLOCK_SIZE];
+
     int * idx = (int *)(pairs + 1);
-    uint16_t index[IQ1S_BLOCK_SIZE/8];
-    int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
 
     for (int ibl = 0; ibl < nbl; ++ibl) {
 
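The hunk above removes the per-call scratch arrays from `quantize_row_iq1_s_impl`; together with the matching hunk in `quantize_iq1_s` near the end of this diff, the buffers now live in the caller and are reused for every row, and the new `quantize_row_iq1_m_impl` follows the same convention. A minimal sketch of that pattern with hypothetical names:

    #include <stdint.h>

    #define BLOCK 32

    /* hypothetical row helper: works out of caller-owned scratch */
    static void row_impl(const float * src, int n, float * weight, int8_t * L) {
        (void)src; (void)n; (void)weight; (void)L; /* ... per-block quantization ... */
    }

    static void quantize_rows(const float * src, int nrow, int n_per_row) {
        float  weight[BLOCK]; /* set up once, shared by all rows */
        int8_t L[BLOCK];
        for (int row = 0; row < nrow; ++row) {
            row_impl(src, n_per_row, weight, L);
            src += n_per_row;
        }
    }

    int main(void) {
        float src[64] = {0};
        quantize_rows(src, 2, 32);
        return 0;
    }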
@@ -11564,15 +11869,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
         for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
         float sigma2 = 2*sumx2/QK_K;
 
-        for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
-            const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
-            const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
-            for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            const float * xb = xbl + block_size*ib;
+            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
             float max = fabsf(xb[0]);
-            for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
+            for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
             if (!max) {
                 scales[ib] = 0;
-                memset(L, 1, IQ1S_BLOCK_SIZE);
+                memset(L, 1, block_size);
                 continue;
             }
             // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
@@ -11581,14 +11886,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
             // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
             // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
             // for each possible and score for each split.
-            for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+            for (int j = 0; j < block_size; ++j) {
                 pairs[2*j] = xb[j];
                 idx[2*j] = j;
             }
-            qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
+            qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
             {
                 sumx[0] = sumw[0] = 0;
-                for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+                for (int j = 0; j < block_size; ++j) {
                     int i = idx[2*j];
                     sumx[j+1] = sumx[j] + weight[i]*xb[i];
                     sumw[j+1] = sumw[j] + weight[i];
|
|
11596
11901
|
}
|
11597
11902
|
float best_score = 0, scale = max;
|
11598
11903
|
int besti1 = -1, besti2 = -1, best_shift = 0;
|
11599
|
-
for (int i1 = 0; i1 <=
|
11600
|
-
for (int i2 = i1; i2 <=
|
11601
|
-
float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[
|
11602
|
-
float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[
|
11904
|
+
for (int i1 = 0; i1 <= block_size; ++i1) {
|
11905
|
+
for (int i2 = i1; i2 <= block_size; ++i2) {
|
11906
|
+
float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
|
11907
|
+
float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
|
11603
11908
|
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
11604
11909
|
scale = sumqx/sumq2; best_score = scale*sumqx;
|
11605
11910
|
besti1 = i1; besti2 = i2; best_shift = 1;
|
11606
11911
|
}
|
11607
|
-
sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[
|
11608
|
-
sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[
|
11912
|
+
sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
|
11913
|
+
sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
|
11609
11914
|
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
|
11610
11915
|
scale = sumqx/sumq2; best_score = scale*sumqx;
|
11611
11916
|
besti1 = i1; besti2 = i2; best_shift = -1;
|
@@ -11615,14 +11920,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
11615
11920
|
GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
|
11616
11921
|
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
|
11617
11922
|
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
|
11618
|
-
for (int j = besti2; j <
|
11923
|
+
for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
|
11619
11924
|
if (scale < 0) {
|
11620
|
-
for (int j = 0; j <
|
11925
|
+
for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
|
11621
11926
|
scale = -scale; best_shift = -best_shift;
|
11622
11927
|
}
|
11623
11928
|
bool all_on_grid = true;
|
11624
11929
|
const float * xx = best_shift == 1 ? x_p : x_m;
|
11625
|
-
for (int k = 0; k <
|
11930
|
+
for (int k = 0; k < block_size/8; ++k) {
|
11626
11931
|
uint16_t u = 0;
|
11627
11932
|
for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
|
11628
11933
|
int grid_index = kmap_q2xs[u];
|
@@ -11636,7 +11941,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
             }
             if (!all_on_grid) {
                 float sumqx = 0, sumq2 = 0;
-                for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
+                for (int k = 0; k < block_size/8; ++k) {
                     const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
                     for (int j = 0; j < 8; ++j) {
                         float w = weight[8*k + j];
@@ -11648,8 +11953,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
                 if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
             }
             uint16_t h = 0;
-            for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
-                y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255;
+            for (int k = 0; k < block_size/8; ++k) {
+                y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
                 h |= (index[k] >> 8) << 3*k;
             }
             y[ibl].qh[ib] = h;
@@ -11660,14 +11965,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
         }
 
         if (!max_scale) {
-            memset(y[ibl].qs, 0, QK_K/8);
             continue;
         }
 
         float d = max_scale/15;
-        y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.
+        y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
         float id = 1/d;
-        for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
             int l = nearest_int(0.5f*(id*scales[ib]-1));
             l = MAX(0, MIN(7, l));
             if (shifts[ib] == -1) l |= 8;
@@ -11678,16 +11982,307 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
 
 size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
+    float  scales[QK_K/IQ1S_BLOCK_SIZE];
+    float  weight[IQ1S_BLOCK_SIZE];
+    int8_t L[IQ1S_BLOCK_SIZE];
+    float  sumx[IQ1S_BLOCK_SIZE+1];
+    float  sumw[IQ1S_BLOCK_SIZE+1];
+    float  pairs[2*IQ1S_BLOCK_SIZE];
+    uint16_t index[IQ1S_BLOCK_SIZE/8];
+    int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
     int nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
-        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
+        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
         src += n_per_row;
         qrow += nblock*sizeof(block_iq1_s);
     }
     return nrow * nblock * sizeof(block_iq1_s);
 }
 
+static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+        float    * scales,
+        float    * weight,
+        float    * pairs,
+        int8_t   * L,
+        uint16_t * index,
+        int8_t   * shifts) {
+
+    const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
+
+    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+    const int      * kmap_q2xs       = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    //GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    block_iq1_m * y = vy;
+
+    const int nbl = n/QK_K;
+
+    const int block_size = IQ1M_BLOCK_SIZE;
+
+    const float x_p[3] = {-1 + IQ1M_DELTA,  IQ1M_DELTA, 1 + IQ1M_DELTA};
+    const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
+    const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
+
+    int * idx = (int *)(pairs + 1);
+
+    float sumqx[4], sumq2[4];
+
+    iq1m_scale_t s;
+    const float * xx;
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+#if QK_K == 64
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+#endif
+        memset(y[ibl].qs, 0, QK_K/8);
+        memset(y[ibl].qh, 0, QK_K/16);
+        memset(y[ibl].scales, 0, QK_K/32);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = 2*sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            const float * xb = xbl + block_size*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
+            float max = fabsf(xb[0]);
+            for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
+            if (!max) {
+                scales[ib] = 0;
+                memset(L, 1, block_size);
+                continue;
+            }
+            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+            // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+            // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+            // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+            // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
+            // for each possible and score for each split.
+            for (int j = 0; j < block_size; ++j) {
+                pairs[2*j] = xb[j];
+                idx[2*j] = j;
+            }
+            qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+            float best_score = 0, scale = max;
+            int besti1 = -1, besti2 = -1, best_k = -1;
+            // 0: +, +
+            // 1: +, -
+            // 2: -, +
+            // 3: -, -
+            for (int i1 = 0; i1 <= block_size; ++i1) {
+                for (int i2 = i1; i2 <= block_size; ++i2) {
+                    memset(sumqx, 0, 4*sizeof(float));
+                    memset(sumq2, 0, 4*sizeof(float));
+                    for (int j = 0; j < i1; ++j) {
+                        int i = idx[2*j];
+                        if (i < block_size/2) {
+                            sumqx[0] += weight[i]*x_p[0]*xb[i];
+                            sumqx[1] += weight[i]*x_p[0]*xb[i];
+                            sumqx[2] += weight[i]*x_m[0]*xb[i];
+                            sumqx[3] += weight[i]*x_m[0]*xb[i];
+                            sumq2[0] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[1] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[2] += weight[i]*x_m[0]*x_m[0];
+                            sumq2[3] += weight[i]*x_m[0]*x_m[0];
+                        } else {
+                            sumqx[0] += weight[i]*x_p[0]*xb[i];
+                            sumqx[2] += weight[i]*x_p[0]*xb[i];
+                            sumqx[1] += weight[i]*x_m[0]*xb[i];
+                            sumqx[3] += weight[i]*x_m[0]*xb[i];
+                            sumq2[0] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[2] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[1] += weight[i]*x_m[0]*x_m[0];
+                            sumq2[3] += weight[i]*x_m[0]*x_m[0];
+                        }
+                    }
+                    for (int j = i1; j < i2; ++j) {
+                        int i = idx[2*j];
+                        if (i < block_size/2) {
+                            sumqx[0] += weight[i]*x_p[1]*xb[i];
+                            sumqx[1] += weight[i]*x_p[1]*xb[i];
+                            sumqx[2] += weight[i]*x_m[1]*xb[i];
+                            sumqx[3] += weight[i]*x_m[1]*xb[i];
+                            sumq2[0] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[1] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[2] += weight[i]*x_m[1]*x_m[1];
+                            sumq2[3] += weight[i]*x_m[1]*x_m[1];
+                        } else {
+                            sumqx[0] += weight[i]*x_p[1]*xb[i];
+                            sumqx[2] += weight[i]*x_p[1]*xb[i];
+                            sumqx[1] += weight[i]*x_m[1]*xb[i];
+                            sumqx[3] += weight[i]*x_m[1]*xb[i];
+                            sumq2[0] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[2] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[1] += weight[i]*x_m[1]*x_m[1];
+                            sumq2[3] += weight[i]*x_m[1]*x_m[1];
+                        }
+                    }
+                    for (int j = i2; j < block_size; ++j) {
+                        int i = idx[2*j];
+                        if (i < block_size/2) {
+                            sumqx[0] += weight[i]*x_p[2]*xb[i];
+                            sumqx[1] += weight[i]*x_p[2]*xb[i];
+                            sumqx[2] += weight[i]*x_m[2]*xb[i];
+                            sumqx[3] += weight[i]*x_m[2]*xb[i];
+                            sumq2[0] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[1] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[2] += weight[i]*x_m[2]*x_m[2];
+                            sumq2[3] += weight[i]*x_m[2]*x_m[2];
+                        } else {
+                            sumqx[0] += weight[i]*x_p[2]*xb[i];
+                            sumqx[2] += weight[i]*x_p[2]*xb[i];
+                            sumqx[1] += weight[i]*x_m[2]*xb[i];
+                            sumqx[3] += weight[i]*x_m[2]*xb[i];
+                            sumq2[0] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[2] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[1] += weight[i]*x_m[2]*x_m[2];
+                            sumq2[3] += weight[i]*x_m[2]*x_m[2];
+                        }
+                    }
+                    for (int k = 0; k < 4; ++k) {
+                        if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
+                            scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
+                            besti1 = i1; besti2 = i2; best_k = k;
+                        }
+                    }
+                }
+            }
+            GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
+            for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+            for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+            for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+            if (scale < 0) {
+                for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+                scale = -scale;
+                best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
+            }
+            bool all_on_grid = true;
+            for (int k = 0; k < block_size/8; ++k) {
+                if (k == 0) xx = best_k < 2 ? x_p : x_m;
+                else xx = best_k%2 == 0 ? x_p : x_m;
+                uint16_t u = 0;
+                for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+                int grid_index = kmap_q2xs[u];
+                if (grid_index < 0) {
+                    all_on_grid = false;
+                    const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                    grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+                    GGML_ASSERT(grid_index >= 0);
+                }
+                index[k] = grid_index;
+            }
+            if (!all_on_grid) {
+                float sumqx_f = 0, sumq2_f = 0;
+                for (int k = 0; k < block_size/8; ++k) {
+                    if (k == 0) xx = best_k < 2 ? x_p : x_m;
+                    else xx = best_k%2 == 0 ? x_p : x_m;
+                    const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
+                    for (int j = 0; j < 8; ++j) {
+                        float w = weight[8*k + j];
+                        float q = xx[(pg[j] - 1)/2];
+                        sumqx_f += w*q*xb[8*k+j];
+                        sumq2_f += w*q*q;
+                    }
+                }
+                if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
+            }
+            y[ibl].qs[2*ib + 0] = index[0] & 255;
+            y[ibl].qs[2*ib + 1] = index[1] & 255;
+            y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            shifts[ib] = best_k;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            continue;
+        }
+
+        uint16_t * sc = (uint16_t *)y[ibl].scales;
+#if QK_K == 64
+        float d = max_scale/31;
+#else
+        float d = max_scale/15;
+#endif
+        float id = 1/d;
+        float sumqx_f = 0, sumq2_f = 0;
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib+0]-1));
+#if QK_K == 64
+            l = MAX(0, MIN(15, l));
+            sc[ib/4] |= (l << 4*(ib%4));
+#else
+            l = MAX(0, MIN(7, l));
+            sc[ib/4] |= (l << 3*(ib%4));
+#endif
+            y[ibl].qh[ib] |= masks[shifts[ib]];
+            const float * xb = xbl + block_size*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
+            for (int k = 0; k < block_size/8; ++k) {
+                if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
+                else xx = shifts[ib]%2 == 0 ? x_p : x_m;
+                const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
+                for (int j = 0; j < 8; ++j) {
+                    float w = weight[8*k + j];
+                    float q = xx[(pg[j] - 1)/2]*(2*l+1);
+                    sumqx_f += w*q*xb[8*k+j];
+                    sumq2_f += w*q*q;
+                }
+            }
+        }
+        if (sumq2_f > 0) d = sumqx_f/sumq2_f;
+        s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
+#if QK_K == 64
+        y[ibl].d = s.f16;
+#else
+        sc[0] |= ((s.u16 & 0x000f) << 12);
+        sc[1] |= ((s.u16 & 0x00f0) <<  8);
+        sc[2] |= ((s.u16 & 0x0f00) <<  4);
+        sc[3] |= ((s.u16 & 0xf000) <<  0);
+#endif
+    }
+}
+
+size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    float  scales[QK_K/IQ1M_BLOCK_SIZE];
+    float  weight[IQ1M_BLOCK_SIZE];
+    int8_t L[IQ1M_BLOCK_SIZE];
+    float  pairs[2*IQ1M_BLOCK_SIZE];
+    uint16_t index[IQ1M_BLOCK_SIZE/8];
+    int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
+    int nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq1_m);
+    }
+    return nrow * nblock * sizeof(block_iq1_m);
+}
+
 // ============================ 4-bit non-linear quants
 
 static inline int best_index_int8(int n, const int8_t * val, float x) {