llama_cpp 0.14.2 → 0.14.4

This diff shows the changes between the publicly released versions of this package, as they appear in their public registries. It is provided for informational purposes only.
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
  }
 
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
- #if __AVXVNNI__
+ #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
      const __m256i zero = _mm256_setzero_si256();
      const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
      return _mm256_cvtepi32_ps(summed_pairs);
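Note on the change above: `#if __AVXVNNI__` only works because an undefined identifier evaluates to 0 inside `#if`, and it warns under `-Wundef`; the `defined(...)` form is the robust spelling and also takes the VNNI path on AVX512VNNI builds, where the 256-bit `_mm256_dpbusd_epi32` is likewise available (strictly via AVX512VL, which this guard assumes). A minimal sketch of the pattern, with an AVX2 fallback of the kind ggml uses elsewhere:

    #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
        // one-instruction u8*s8 dot product with i32 accumulation
        const __m256i acc = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ax, sy);
    #else
        // AVX2 fallback: widen u8*s8 products to i16 pairs, then sum pairs into i32
        const __m256i prod = _mm256_maddubs_epi16(ax, sy);
        const __m256i acc  = _mm256_madd_epi16(prod, _mm256_set1_epi16(1));
    #endif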
@@ -3474,6 +3474,65 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
      }
  }
 
+ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
+     assert(k % QK_K == 0);
+     const int nb = k / QK_K;
+
+     float delta[4];
+     uint16_t idx[4];
+
+ #if QK_K != 64
+     iq1m_scale_t scale;
+ #endif
+
+     for (int i = 0; i < nb; i++) {
+
+         const uint16_t * sc = (const uint16_t *)x[i].scales;
+ #if QK_K == 64
+         const float d = GGML_FP16_TO_FP32(x[i].d);
+ #else
+         scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+         const float d = GGML_FP16_TO_FP32(scale.f16);
+ #endif
+         const uint8_t * qs = x[i].qs;
+         const uint8_t * qh = x[i].qh;
+
+         for (int ib = 0; ib < QK_K/32; ++ib) {
+ #if QK_K == 64
+             const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1);
+             const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1);
+ #else
+             const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
+             const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
+ #endif
+             idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
+             idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
+             idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
+             idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
+             delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+             delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+             delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+             delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+             for (int l = 0; l < 2; ++l) {
+                 const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                 for (int j = 0; j < 8; ++j) {
+                     y[j] = dl1 * (grid[j] + delta[l]);
+                 }
+                 y += 8;
+             }
+             for (int l = 2; l < 4; ++l) {
+                 const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                 for (int j = 0; j < 8; ++j) {
+                     y[j] = dl2 * (grid[j] + delta[l]);
+                 }
+                 y += 8;
+             }
+             qs += 4;
+             qh += 2;
+         }
+     }
+ }
+
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
  void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
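Note on the new dequantizer: for QK_K != 64 a block_iq1_m carries no dedicated `d` field; the fp16 superblock scale is scattered across the top nibbles of the four 16-bit words of `scales`, and every consumer in this diff reassembles it with the same expression. A hedged standalone sketch (`iq1m_scale_t` is the fp16/u16 union the code relies on):

    static inline float iq1m_superblock_scale(const uint16_t sc[4]) {
        iq1m_scale_t s; // union { ggml_fp16_t f16; uint16_t u16; }
        s.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0)
              | ((sc[2] >>  4) & 0x0f00) |  (sc[3] & 0xf000);
        return GGML_FP16_TO_FP32(s.f16);
    }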
@@ -9695,6 +9754,248 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
  #endif
  }
 
+ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+     assert(n % QK_K == 0);
+     assert(nrc == 1);
+     UNUSED(nrc);
+     UNUSED(bx);
+     UNUSED(by);
+     UNUSED(bs);
+
+     const block_iq1_m * restrict x = vx;
+     const block_q8_K  * restrict y = vy;
+
+     const int nb = n / QK_K;
+
+ #if QK_K != 64
+     iq1m_scale_t scale;
+ #endif
+
+ #if defined __ARM_NEON
+
+ #if QK_K == 64
+     const int32x4_t mask = vdupq_n_s32(0xf);
+ #else
+     const int32x4_t mask = vdupq_n_s32(0x7);
+ #endif
+     const int32x4_t mone  = vdupq_n_s32(1);
+     const int32x4_t mzero = vdupq_n_s32(0);
+
+     ggml_int8x16x4_t deltas;
+     deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
+     deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
+     deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
+     deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
+
+     ggml_int8x16x4_t q1b;
+     ggml_int8x16x4_t q8b;
+
+     uint32_t aux32;
+     const uint8_t * aux8 = (const uint8_t *)&aux32;
+
+     float sumf = 0;
+     for (int i = 0; i < nb; ++i) {
+
+         const int8_t   * q8 = y[i].qs;
+         const uint8_t  * qs = x[i].qs;
+         const uint8_t  * qh = x[i].qh;
+         const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ #if QK_K != 64
+         scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ #endif
+
+         int32x4_t sumi1 = mzero;
+         int32x4_t sumi2 = mzero;
+
+         for (int ib = 0; ib < QK_K/32; ib += 2) {
+
+             q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
+                                      vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
+             q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
+                                      vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
+             q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
+                                      vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
+             q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
+                                      vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
+
+             q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+             const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
+             const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
+             const int32x4_t p12 = vpaddq_s32(p1, p2);
+
+             const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
+             aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
+
+             const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
+             const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
+             const int32x4_t p34 = vpaddq_s32(p3, p4);
+
+ #if QK_K == 64
+             int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12);
+ #else
+             int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
+ #endif
+             scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
+
+             sumi1 = vmlaq_s32(sumi1, scales_4, p12);
+             sumi2 = vmlaq_s32(sumi2, scales_4, p34);
+
+             qs += 8; qh += 4;
+
+         }
+
+ #if QK_K == 64
+         sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+ #else
+         sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+ #endif
+     }
+
+     *s = sumf;
+
+ #elif defined __AVX2__
+
+ #if QK_K == 64
+     const __m256i mask = _mm256_set1_epi16(0xf);
+ #else
+     const __m256i mask = _mm256_set1_epi16(0x7);
+ #endif
+     const __m256i mone = _mm256_set1_epi16(1);
+
+     __m256 accum1 = _mm256_setzero_ps();
+     __m256 accum2 = _mm256_setzero_ps();
+     for (int i = 0; i < nb; ++i) {
+
+         const int8_t   * q8 = y[i].qs;
+         const uint8_t  * qs = x[i].qs;
+         const uint8_t  * qh = x[i].qh;
+         const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ #if QK_K != 64
+         scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ #endif
+
+         __m256i sumi1 = _mm256_setzero_si256();
+         __m256i sumi2 = _mm256_setzero_si256();
+         for (int ib = 0; ib < QK_K/32; ib += 2) {
+             const __m256i q1b_1 = _mm256_set_epi64x(
+                     iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
+                     iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
+             );
+             const __m256i q1b_2 = _mm256_set_epi64x(
+                     iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
+                     iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
+             );
+             const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+             const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+             const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+             const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+
+             const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                      qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                      qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                      qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+             const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                      qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                      qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                      qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+             const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
+             const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+ #if QK_K == 64
+             __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >>  4), _mm_set1_epi16(sc[0] >> 0));
+             __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8));
+ #else
+             __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
+             __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+ #endif
+             scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
+             scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+             const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
+             const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
+             const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
+             const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
+
+             sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
+             sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
+
+             qs += 8; qh += 4;
+         }
+
+ #if QK_K == 64
+         const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+ #else
+         const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+ #endif
+         accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
+         accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
+
+     }
+
+     *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+ #else
+
+     int sum1[2], sum2[2], delta[4];
+
+     float sumf = 0;
+     for (int i = 0; i < nb; i++) {
+
+         const int8_t   * q8 = y[i].qs;
+         const uint8_t  * qs = x[i].qs;
+         const uint8_t  * qh = x[i].qh;
+         const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ #if QK_K != 64
+         scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ #endif
+
+         int sumi1 = 0, sumi2 = 0;
+         for (int ib = 0; ib < QK_K/32; ++ib) {
+             delta[0] = qh[0] & 0x08 ? -1 : 1;
+             delta[1] = qh[0] & 0x80 ? -1 : 1;
+             delta[2] = qh[1] & 0x08 ? -1 : 1;
+             delta[3] = qh[1] & 0x80 ? -1 : 1;
+             sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
+             for (int l = 0; l < 4; ++l) {
+                 const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
+                 int lsum1 = 0, lsum2 = 0;
+                 for (int j = 0; j < 8; ++j) {
+                     lsum1 += q8[j] * grid[j];
+                     lsum2 += q8[j];
+                 }
+                 q8 += 8;
+                 sum1[l/2] += lsum1;
+                 sum2[l/2] += lsum2*delta[l];
+             }
+ #if QK_K == 64
+             const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1;
+             const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1;
+ #else
+             const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
+             const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
+ #endif
+             sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
+             sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
+             qs += 4;
+             qh += 2;
+         }
+
+ #if QK_K == 64
+         sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+ #else
+         sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+ #endif
+     }
+
+     *s = sumf;
+
+ #endif
+ }
+
  void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
      assert(nrc == 1);
      UNUSED(nrc);
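Note on the new kernel: the NEON, AVX2, and scalar branches all compute the same quantity. Per 8-weight group the partial sums are `sum(q8[j]*grid[j])` and `±sum(q8[j])`, both weighted by the odd block scale `2*s + 1`; the IQ1M_DELTA magnitude of the sign term is applied once at the very end. A hedged usage sketch; the signature matches the other `ggml_vec_dot_*_q8_K` kernels, and with `nrc == 1` the `bs`/`bx`/`by` strides are unused:

    float dot; // x_row holds block_iq1_m, y_row holds block_q8_K, n is a multiple of QK_K
    ggml_vec_dot_iq1_m_q8_K(n, &dot, 0, x_row, 0, y_row, 0, 1);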
@@ -9938,17 +10239,17 @@ static iq2_entry_t iq2_data[4] = {
  };
 
  static inline int iq2_data_index(enum ggml_type type) {
-     GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+     GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
      return type == GGML_TYPE_IQ2_XXS ? 0 :
             type == GGML_TYPE_IQ2_XS  ? 1 :
-            type == GGML_TYPE_IQ1_S   ? 2 : 3;
+            type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
  }
 
  static inline int iq2_grid_size(enum ggml_type type) {
-     GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+     GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
      return type == GGML_TYPE_IQ2_XXS ? 256 :
             type == GGML_TYPE_IQ2_XS  ? 512 :
-            type == GGML_TYPE_IQ1_S   ? NGRID_IQ1S : 1024;
+            type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
  }
 
  static int iq2_compare_func(const void * left, const void * right) {
@@ -10214,10 +10515,10 @@ void iq2xs_init_impl(enum ggml_type type) {
 
      const int kmap_size = 43692;
      //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
-     const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
+     const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
      const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
                               type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 :
-                              type == GGML_TYPE_IQ1_S   ? kgrid_1bit_2048 : kgrid_2bit_1024;
+                              type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
      uint64_t * kgrid_q2xs;
      int      * kmap_q2xs;
      uint16_t * kneighbors_q2xs;
@@ -10314,7 +10615,7 @@ void iq2xs_init_impl(enum ggml_type type) {
  }
 
  void iq2xs_free_impl(enum ggml_type type) {
-     GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
+     GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
      const int gindex = iq2_data_index(type);
      if (iq2_data[gindex].grid) {
          free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
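Note on the plumbing above: IQ1_M introduces no new codebook; it shares the NGRID_IQ1S-entry grid of IQ1_S (kgrid_1bit_2048), so the helpers simply route GGML_TYPE_IQ1_M to the same iq2_data slot. Per the "forgot to call ggml_quantize_init()?" asserts later in this diff, the grid must be built before quantizing; a hedged sketch of the expected call order:

    ggml_quantize_init(GGML_TYPE_IQ1_M); // builds the shared IQ1_S/IQ1_M grid, map and neighbours once
    /* ... quantize_iq1_m(...) / dequantize_row_iq1_m(...) ... */
    ggml_quantize_free();                // frees every cached grid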
@@ -11520,7 +11821,16 @@ static int iq1_sort_helper(const void * left, const void * right) {
  }
 
  #define IQ1S_BLOCK_SIZE 32
- static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+ #define IQ1M_BLOCK_SIZE 16
+ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+         float    * scales,
+         float    * weight,
+         float    * sumx,
+         float    * sumw,
+         float    * pairs,
+         int8_t   * L,
+         uint16_t * index,
+         int8_t   * shifts) {
 
      const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
 
@@ -11534,22 +11844,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
      GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
      GGML_ASSERT(n%QK_K == 0);
 
+     block_iq1_s * y = vy;
+
      const int nbl = n/QK_K;
 
-     block_iq1_s * y = vy;
+     const int block_size = IQ1S_BLOCK_SIZE;
 
      const float x_p[3] = {-1 + IQ1S_DELTA,  IQ1S_DELTA, 1 + IQ1S_DELTA};
      const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
 
-     float  scales[QK_K/IQ1S_BLOCK_SIZE];
-     float  weight[IQ1S_BLOCK_SIZE];
-     int8_t L[IQ1S_BLOCK_SIZE];
-     float  sumx[IQ1S_BLOCK_SIZE+1];
-     float  sumw[IQ1S_BLOCK_SIZE+1];
-     float  pairs[2*IQ1S_BLOCK_SIZE];
+
      int * idx = (int *)(pairs + 1);
-     uint16_t index[IQ1S_BLOCK_SIZE/8];
-     int8_t  shifts[QK_K/IQ1S_BLOCK_SIZE];
 
      for (int ibl = 0; ibl < nbl; ++ibl) {
 
@@ -11564,15 +11869,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
          for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
          float sigma2 = 2*sumx2/QK_K;
 
-         for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
-             const float * xb = xbl + IQ1S_BLOCK_SIZE*ib;
-             const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib;
-             for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+         for (int ib = 0; ib < QK_K/block_size; ++ib) {
+             const float * xb = xbl + block_size*ib;
+             const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+             for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
              float max = fabsf(xb[0]);
-             for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i]));
+             for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
              if (!max) {
                  scales[ib] = 0;
-                 memset(L, 1, IQ1S_BLOCK_SIZE);
+                 memset(L, 1, block_size);
                  continue;
              }
              // Here we solve exactly the weighted sum of squared difference (SSD) minimization problem.
@@ -11581,14 +11886,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
              // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
              // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
              // and score for each possible split.
-             for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+             for (int j = 0; j < block_size; ++j) {
                  pairs[2*j] = xb[j];
                  idx[2*j] = j;
              }
-             qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper);
+             qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
              {
                  sumx[0] = sumw[0] = 0;
-                 for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) {
+                 for (int j = 0; j < block_size; ++j) {
                      int i = idx[2*j];
                      sumx[j+1] = sumx[j] + weight[i]*xb[i];
                      sumw[j+1] = sumw[j] + weight[i];
@@ -11596,16 +11901,16 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
              }
              float best_score = 0, scale = max;
              int besti1 = -1, besti2 = -1, best_shift = 0;
-             for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
-                 for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
-                     float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
-                     float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
+             for (int i1 = 0; i1 <= block_size; ++i1) {
+                 for (int i2 = i1; i2 <= block_size; ++i2) {
+                     float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
+                     float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
                      if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
                          scale = sumqx/sumq2; best_score = scale*sumqx;
                          besti1 = i1; besti2 = i2; best_shift = 1;
                      }
-                     sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
-                     sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
+                     sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
+                     sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
                      if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
                          scale = sumqx/sumq2; best_score = scale*sumqx;
                          besti1 = i1; besti2 = i2; best_shift = -1;
@@ -11615,14 +11920,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
              GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
              for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
              for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
-             for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
+             for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
              if (scale < 0) {
-                 for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
+                 for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
                  scale = -scale; best_shift = -best_shift;
              }
              bool all_on_grid = true;
              const float * xx = best_shift == 1 ? x_p : x_m;
-             for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
+             for (int k = 0; k < block_size/8; ++k) {
                  uint16_t u = 0;
                  for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
                  int grid_index = kmap_q2xs[u];
@@ -11636,7 +11941,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
              }
              if (!all_on_grid) {
                  float sumqx = 0, sumq2 = 0;
-                 for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
+                 for (int k = 0; k < block_size/8; ++k) {
                      const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
                      for (int j = 0; j < 8; ++j) {
                          float w = weight[8*k + j];
@@ -11648,8 +11953,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
                  if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
              }
              uint16_t h = 0;
-             for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
-                 y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255;
+             for (int k = 0; k < block_size/8; ++k) {
+                 y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
                  h |= (index[k] >> 8) << 3*k;
              }
              y[ibl].qh[ib] = h;
@@ -11660,14 +11965,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
          }
 
          if (!max_scale) {
-             memset(y[ibl].qs, 0, QK_K/8);
              continue;
          }
 
          float d = max_scale/15;
-         y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.085f is another fudge factor. Don't ask me why it is needed.
+         y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
          float id = 1/d;
-         for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
+         for (int ib = 0; ib < QK_K/block_size; ++ib) {
              int l = nearest_int(0.5f*(id*scales[ib]-1));
              l = MAX(0, MIN(7, l));
              if (shifts[ib] == -1) l |= 8;
@@ -11678,16 +11982,307 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
 
  size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
      GGML_ASSERT(n_per_row%QK_K == 0);
+     float  scales[QK_K/IQ1S_BLOCK_SIZE];
+     float  weight[IQ1S_BLOCK_SIZE];
+     int8_t L[IQ1S_BLOCK_SIZE];
+     float  sumx[IQ1S_BLOCK_SIZE+1];
+     float  sumw[IQ1S_BLOCK_SIZE+1];
+     float  pairs[2*IQ1S_BLOCK_SIZE];
+     uint16_t index[IQ1S_BLOCK_SIZE/8];
+     int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
      int nblock = n_per_row/QK_K;
      char * qrow = (char *)dst;
      for (int row = 0; row < nrow; ++row) {
-         quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
+         quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
          src += n_per_row;
          qrow += nblock*sizeof(block_iq1_s);
      }
      return nrow * nblock * sizeof(block_iq1_s);
  }
 
+ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+         float    * scales,
+         float    * weight,
+         float    * pairs,
+         int8_t   * L,
+         uint16_t * index,
+         int8_t   * shifts) {
+
+     const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
+
+     const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+     const int      * kmap_q2xs       = iq2_data[gindex].map;
+     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+     //GGML_ASSERT(quant_weights   && "missing quantization weights");
+     GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+     GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+     GGML_ASSERT(n%QK_K == 0);
+
+     block_iq1_m * y = vy;
+
+     const int nbl = n/QK_K;
+
+     const int block_size = IQ1M_BLOCK_SIZE;
+
+     const float x_p[3] = {-1 + IQ1M_DELTA,  IQ1M_DELTA, 1 + IQ1M_DELTA};
+     const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
+     const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
+
+     int * idx = (int *)(pairs + 1);
+
+     float sumqx[4], sumq2[4];
+
+     iq1m_scale_t s;
+     const float * xx;
+
+     for (int ibl = 0; ibl < nbl; ++ibl) {
+
+ #if QK_K == 64
+         y[ibl].d = GGML_FP32_TO_FP16(0.f);
+ #endif
+         memset(y[ibl].qs, 0, QK_K/8);
+         memset(y[ibl].qh, 0, QK_K/16);
+         memset(y[ibl].scales, 0, QK_K/32);
+
+         float max_scale = 0;
+
+         const float * xbl = x + QK_K*ibl;
+         float sumx2 = 0;
+         for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+         float sigma2 = 2*sumx2/QK_K;
+
+         for (int ib = 0; ib < QK_K/block_size; ++ib) {
+             const float * xb = xbl + block_size*ib;
+             if (quant_weights) {
+                 const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                 for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+             } else {
+                 for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+             }
+             float max = fabsf(xb[0]);
+             for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
+             if (!max) {
+                 scales[ib] = 0;
+                 memset(L, 1, block_size);
+                 continue;
+             }
+             // Here we solve exactly the weighted sum of squared difference (SSD) minimization problem.
+             // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+             // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+             // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+             // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
+             // and score for each possible split.
+             for (int j = 0; j < block_size; ++j) {
+                 pairs[2*j] = xb[j];
+                 idx[2*j] = j;
+             }
+             qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+             float best_score = 0, scale = max;
+             int besti1 = -1, besti2 = -1, best_k = -1;
+             // 0: +, +
+             // 1: +, -
+             // 2: -, +
+             // 3: -, -
+             for (int i1 = 0; i1 <= block_size; ++i1) {
+                 for (int i2 = i1; i2 <= block_size; ++i2) {
+                     memset(sumqx, 0, 4*sizeof(float));
+                     memset(sumq2, 0, 4*sizeof(float));
+                     for (int j = 0; j < i1; ++j) {
+                         int i = idx[2*j];
+                         if (i < block_size/2) {
+                             sumqx[0] += weight[i]*x_p[0]*xb[i];
+                             sumqx[1] += weight[i]*x_p[0]*xb[i];
+                             sumqx[2] += weight[i]*x_m[0]*xb[i];
+                             sumqx[3] += weight[i]*x_m[0]*xb[i];
+                             sumq2[0] += weight[i]*x_p[0]*x_p[0];
+                             sumq2[1] += weight[i]*x_p[0]*x_p[0];
+                             sumq2[2] += weight[i]*x_m[0]*x_m[0];
+                             sumq2[3] += weight[i]*x_m[0]*x_m[0];
+                         } else {
+                             sumqx[0] += weight[i]*x_p[0]*xb[i];
+                             sumqx[2] += weight[i]*x_p[0]*xb[i];
+                             sumqx[1] += weight[i]*x_m[0]*xb[i];
+                             sumqx[3] += weight[i]*x_m[0]*xb[i];
+                             sumq2[0] += weight[i]*x_p[0]*x_p[0];
+                             sumq2[2] += weight[i]*x_p[0]*x_p[0];
+                             sumq2[1] += weight[i]*x_m[0]*x_m[0];
+                             sumq2[3] += weight[i]*x_m[0]*x_m[0];
+                         }
+                     }
+                     for (int j = i1; j < i2; ++j) {
+                         int i = idx[2*j];
+                         if (i < block_size/2) {
+                             sumqx[0] += weight[i]*x_p[1]*xb[i];
+                             sumqx[1] += weight[i]*x_p[1]*xb[i];
+                             sumqx[2] += weight[i]*x_m[1]*xb[i];
+                             sumqx[3] += weight[i]*x_m[1]*xb[i];
+                             sumq2[0] += weight[i]*x_p[1]*x_p[1];
+                             sumq2[1] += weight[i]*x_p[1]*x_p[1];
+                             sumq2[2] += weight[i]*x_m[1]*x_m[1];
+                             sumq2[3] += weight[i]*x_m[1]*x_m[1];
+                         } else {
+                             sumqx[0] += weight[i]*x_p[1]*xb[i];
+                             sumqx[2] += weight[i]*x_p[1]*xb[i];
+                             sumqx[1] += weight[i]*x_m[1]*xb[i];
+                             sumqx[3] += weight[i]*x_m[1]*xb[i];
+                             sumq2[0] += weight[i]*x_p[1]*x_p[1];
+                             sumq2[2] += weight[i]*x_p[1]*x_p[1];
+                             sumq2[1] += weight[i]*x_m[1]*x_m[1];
+                             sumq2[3] += weight[i]*x_m[1]*x_m[1];
+                         }
+                     }
+                     for (int j = i2; j < block_size; ++j) {
+                         int i = idx[2*j];
+                         if (i < block_size/2) {
+                             sumqx[0] += weight[i]*x_p[2]*xb[i];
+                             sumqx[1] += weight[i]*x_p[2]*xb[i];
+                             sumqx[2] += weight[i]*x_m[2]*xb[i];
+                             sumqx[3] += weight[i]*x_m[2]*xb[i];
+                             sumq2[0] += weight[i]*x_p[2]*x_p[2];
+                             sumq2[1] += weight[i]*x_p[2]*x_p[2];
+                             sumq2[2] += weight[i]*x_m[2]*x_m[2];
+                             sumq2[3] += weight[i]*x_m[2]*x_m[2];
+                         } else {
+                             sumqx[0] += weight[i]*x_p[2]*xb[i];
+                             sumqx[2] += weight[i]*x_p[2]*xb[i];
+                             sumqx[1] += weight[i]*x_m[2]*xb[i];
+                             sumqx[3] += weight[i]*x_m[2]*xb[i];
+                             sumq2[0] += weight[i]*x_p[2]*x_p[2];
+                             sumq2[2] += weight[i]*x_p[2]*x_p[2];
+                             sumq2[1] += weight[i]*x_m[2]*x_m[2];
+                             sumq2[3] += weight[i]*x_m[2]*x_m[2];
+                         }
+                     }
+                     for (int k = 0; k < 4; ++k) {
+                         if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
+                             scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
+                             besti1 = i1; besti2 = i2; best_k = k;
+                         }
+                     }
+                 }
+             }
+             GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
+             for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+             for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+             for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+             if (scale < 0) {
+                 for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+                 scale = -scale;
+                 best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
+             }
+             bool all_on_grid = true;
+             for (int k = 0; k < block_size/8; ++k) {
+                 if (k == 0) xx = best_k < 2 ? x_p : x_m;
+                 else xx = best_k%2 == 0 ? x_p : x_m;
+                 uint16_t u = 0;
+                 for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+                 int grid_index = kmap_q2xs[u];
+                 if (grid_index < 0) {
+                     all_on_grid = false;
+                     const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                     grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+                     GGML_ASSERT(grid_index >= 0);
+                 }
+                 index[k] = grid_index;
+             }
+             if (!all_on_grid) {
+                 float sumqx_f = 0, sumq2_f = 0;
+                 for (int k = 0; k < block_size/8; ++k) {
+                     if (k == 0) xx = best_k < 2 ? x_p : x_m;
+                     else xx = best_k%2 == 0 ? x_p : x_m;
+                     const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
+                     for (int j = 0; j < 8; ++j) {
+                         float w = weight[8*k + j];
+                         float q = xx[(pg[j] - 1)/2];
+                         sumqx_f += w*q*xb[8*k+j];
+                         sumq2_f += w*q*q;
+                     }
+                 }
+                 if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
+             }
+             y[ibl].qs[2*ib + 0] = index[0] & 255;
+             y[ibl].qs[2*ib + 1] = index[1] & 255;
+             y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
+             GGML_ASSERT(scale >= 0);
+             scales[ib] = scale;
+             shifts[ib] = best_k;
+             max_scale = MAX(max_scale, scale);
+         }
+
+         if (!max_scale) {
+             continue;
+         }
+
+         uint16_t * sc = (uint16_t *)y[ibl].scales;
+ #if QK_K == 64
+         float d = max_scale/31;
+ #else
+         float d = max_scale/15;
+ #endif
+         float id = 1/d;
+         float sumqx_f = 0, sumq2_f = 0;
+         for (int ib = 0; ib < QK_K/block_size; ++ib) {
+             int l = nearest_int(0.5f*(id*scales[ib+0]-1));
+ #if QK_K == 64
+             l = MAX(0, MIN(15, l));
+             sc[ib/4] |= (l << 4*(ib%4));
+ #else
+             l = MAX(0, MIN(7, l));
+             sc[ib/4] |= (l << 3*(ib%4));
+ #endif
+             y[ibl].qh[ib] |= masks[shifts[ib]];
+             const float * xb = xbl + block_size*ib;
+             if (quant_weights) {
+                 const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                 for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+             } else {
+                 for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+             }
+             for (int k = 0; k < block_size/8; ++k) {
+                 if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
+                 else xx = shifts[ib]%2 == 0 ? x_p : x_m;
+                 const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
+                 for (int j = 0; j < 8; ++j) {
+                     float w = weight[8*k + j];
+                     float q = xx[(pg[j] - 1)/2]*(2*l+1);
+                     sumqx_f += w*q*xb[8*k+j];
+                     sumq2_f += w*q*q;
+                 }
+             }
+         }
+         if (sumq2_f > 0) d = sumqx_f/sumq2_f;
+         s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
+ #if QK_K == 64
+         y[ibl].d = s.f16;
+ #else
+         sc[0] |= ((s.u16 & 0x000f) << 12);
+         sc[1] |= ((s.u16 & 0x00f0) <<  8);
+         sc[2] |= ((s.u16 & 0x0f00) <<  4);
+         sc[3] |= ((s.u16 & 0xf000) <<  0);
+ #endif
+     }
+ }
+
+ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
+     GGML_ASSERT(n_per_row%QK_K == 0);
+     float  scales[QK_K/IQ1M_BLOCK_SIZE];
+     float  weight[IQ1M_BLOCK_SIZE];
+     int8_t L[IQ1M_BLOCK_SIZE];
+     float  pairs[2*IQ1M_BLOCK_SIZE];
+     uint16_t index[IQ1M_BLOCK_SIZE/8];
+     int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
+     int nblock = n_per_row/QK_K;
+     char * qrow = (char *)dst;
+     for (int row = 0; row < nrow; ++row) {
+         quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
+         src += n_per_row;
+         qrow += nblock*sizeof(block_iq1_m);
+     }
+     return nrow * nblock * sizeof(block_iq1_m);
+ }
+
  // ============================ 4-bit non-linear quants
 
  static inline int best_index_int8(int n, const int8_t * val, float x) {
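Note on the new quantizer: unlike the IQ1_S path, quantize_row_iq1_m_impl tolerates `quant_weights == NULL` (falling back to `weight[i] = xb[i]*xb[i]`), and the scratch buffers now live in the thin `quantize_iq1_*` wrappers and are passed down, so they are declared once per row loop rather than once per impl call. A hedged usage sketch of the new entry point:

    // one row of n_per_row floats -> n_per_row/QK_K IQ1_M superblocks
    size_t bytes = quantize_iq1_m(src, dst, /*nrow=*/1, n_per_row, /*quant_weights=*/NULL);
    // bytes == (n_per_row/QK_K) * sizeof(block_iq1_m)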
@@ -11705,9 +12300,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
          ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
          float * scales, float * weight, uint8_t * L,
          const int8_t * values,
-         const float * quant_weights) {
-
-     const int ntry = 7;
+         const float * quant_weights,
+         const int ntry) {
 
      float sigma2 = 0;
      for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
@@ -11719,6 +12313,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
      float max_scale = 0, amax_scale = 0;
      for (int ib = 0; ib < super_block_size/block_size; ++ib) {
          const float * xb = x + ib*block_size;
+         uint8_t * Lb = L + ib*block_size;
          if (quant_weights) {
              const float * qw = quant_weights + ib*block_size;
              for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
@@ -11736,12 +12331,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
              scales[ib] = 0;
              continue;
          }
-         float d = -max/values[0];
+         float d = ntry > 0 ? -max/values[0] : max/values[0];
          float id = 1/d;
          float sumqx = 0, sumq2 = 0;
          for (int j = 0; j < block_size; ++j) {
              float al = id*xb[j];
              int l = best_index_int8(16, values, al);
+             Lb[j] = l;
              float q = values[l];
              float w = weight[j];
              sumqx += w*q*xb[j];
@@ -11796,9 +12392,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
          }
      } else {
          dh[0] = GGML_FP32_TO_FP16(scales[0]);
-         float id = scales[0] ? 1/scales[0] : 0;
-         for (int j = 0; j < super_block_size; ++j) {
-             L[j] = best_index_int8(16, values, id*x[j]);
+         if (ntry > 0) {
+             float id = scales[0] ? 1/scales[0] : 0;
+             for (int j = 0; j < super_block_size; ++j) {
+                 L[j] = best_index_int8(16, values, id*x[j]);
+             }
          }
      }
 
@@ -11823,7 +12421,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
      for (int ibl = 0; ibl < nblock; ++ibl) {
          const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
          quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
-                 &scale, weight, L, kvalues_iq4nl, qw);
+                 &scale, weight, L, kvalues_iq4nl, qw, 7);
      }
      src += n_per_row;
      qrow += nblock*sizeof(block_iq4_nl);
@@ -11832,14 +12430,23 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
  }
 
  void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
-     assert(k % QK4_NL == 0);
-     block_iq4_nl * restrict y = vy;
-     quantize_row_iq4_nl_reference(x, y, k);
+     GGML_ASSERT(k%QK4_NL == 0);
+     int nblock = k/QK4_NL;
+     uint8_t L[QK4_NL];
+     float weight[QK4_NL];
+     uint16_t unused_h;
+     uint8_t * unused_l = NULL;
+     float scale;
+     block_iq4_nl * iq4 = (block_iq4_nl *)vy;
+     for (int ibl = 0; ibl < nblock; ++ibl) {
+         quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
+                 &scale, weight, L, kvalues_iq4nl, NULL, -1);
+     }
  }
 
  void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
      assert(k % QK4_NL == 0);
-     quantize_iq4_nl(x, y, 1, k, NULL);
+     quantize_row_iq4_nl(x, y, k);
  }
 
  size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
@@ -11857,7 +12464,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
      for (int ibl = 0; ibl < nblock; ++ibl) {
          const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
          quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
-                 scales, weight, L, kvalues_iq4nl, qw);
+                 scales, weight, L, kvalues_iq4nl, qw, 7);
      }
      src += n_per_row;
      qrow += nblock*sizeof(block_iq4_xs);
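Note on the `ntry` parameter threaded through quantize_row_iq4_nl_impl: the importance-weighted entry points (quantize_iq4_nl, quantize_iq4_xs) keep the previous behavior with `ntry = 7`, while the rewritten quantize_row_iq4_nl passes `-1` for a cheaper single pass. A hedged summary of the contract as it reads from this diff:

    // ntry > 0:  sign-anchored scale (d = -max/values[0]); single-superblock
    //            indices are recomputed from the final scale at the end.
    // ntry <= 0: d = max/values[0], one pass; the indices captured in Lb[]
    //            during the scan are kept as-is.
    quantize_row_iq4_nl_impl(QK4_NL, 32, x, &iq4->d, iq4->qs, &unused_h, unused_l,
                             &scale, weight, L, kvalues_iq4nl, NULL, /*ntry=*/-1);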