llama_cpp 0.14.3 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
| @@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) { | |
| 132 132 | 
             
            }
         | 
| 133 133 |  | 
| 134 134 | 
             
            static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
         | 
| 135 | 
            -
            #if __AVXVNNI__
         | 
| 135 | 
            +
            #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
         | 
| 136 136 | 
             
                const __m256i zero = _mm256_setzero_si256();
         | 
| 137 137 | 
             
                const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
         | 
| 138 138 | 
             
                return _mm256_cvtepi32_ps(summed_pairs);
         | 
| @@ -3474,6 +3474,65 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in | |
| 3474 3474 | 
             
                }
         | 
| 3475 3475 | 
             
            }
         | 
| 3476 3476 |  | 
| 3477 | 
            +
            void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
         | 
| 3478 | 
            +
                assert(k % QK_K == 0);
         | 
| 3479 | 
            +
                const int nb = k / QK_K;
         | 
| 3480 | 
            +
             | 
| 3481 | 
            +
                float delta[4];
         | 
| 3482 | 
            +
                uint16_t idx[4];
         | 
| 3483 | 
            +
             | 
| 3484 | 
            +
            #if QK_K != 64
         | 
| 3485 | 
            +
                iq1m_scale_t scale;
         | 
| 3486 | 
            +
            #endif
         | 
| 3487 | 
            +
             | 
| 3488 | 
            +
                for (int i = 0; i < nb; i++) {
         | 
| 3489 | 
            +
             | 
| 3490 | 
            +
                    const uint16_t * sc = (const uint16_t *)x[i].scales;
         | 
| 3491 | 
            +
            #if QK_K == 64
         | 
| 3492 | 
            +
                    const float d = GGML_FP16_TO_FP32(x[i].d);
         | 
| 3493 | 
            +
            #else
         | 
| 3494 | 
            +
                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         | 
| 3495 | 
            +
                    const float d = GGML_FP16_TO_FP32(scale.f16);
         | 
| 3496 | 
            +
            #endif
         | 
| 3497 | 
            +
                    const uint8_t * qs = x[i].qs;
         | 
| 3498 | 
            +
                    const uint8_t * qh = x[i].qh;
         | 
| 3499 | 
            +
             | 
| 3500 | 
            +
                    for (int ib = 0; ib < QK_K/32; ++ib) {
         | 
| 3501 | 
            +
            #if QK_K == 64
         | 
| 3502 | 
            +
                        const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1);
         | 
| 3503 | 
            +
                        const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1);
         | 
| 3504 | 
            +
            #else
         | 
| 3505 | 
            +
                        const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
         | 
| 3506 | 
            +
                        const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
         | 
| 3507 | 
            +
            #endif
         | 
| 3508 | 
            +
                        idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
         | 
| 3509 | 
            +
                        idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
         | 
| 3510 | 
            +
                        idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
         | 
| 3511 | 
            +
                        idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
         | 
| 3512 | 
            +
                        delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
         | 
| 3513 | 
            +
                        delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
         | 
| 3514 | 
            +
                        delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
         | 
| 3515 | 
            +
                        delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
         | 
| 3516 | 
            +
                        for (int l = 0; l < 2; ++l) {
         | 
| 3517 | 
            +
                            const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
         | 
| 3518 | 
            +
                            for (int j = 0; j < 8; ++j) {
         | 
| 3519 | 
            +
                                y[j] = dl1 * (grid[j] + delta[l]);
         | 
| 3520 | 
            +
                            }
         | 
| 3521 | 
            +
                            y += 8;
         | 
| 3522 | 
            +
                        }
         | 
| 3523 | 
            +
                        for (int l = 2; l < 4; ++l) {
         | 
| 3524 | 
            +
                            const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
         | 
| 3525 | 
            +
                            for (int j = 0; j < 8; ++j) {
         | 
| 3526 | 
            +
                                y[j] = dl2 * (grid[j] + delta[l]);
         | 
| 3527 | 
            +
                            }
         | 
| 3528 | 
            +
                            y += 8;
         | 
| 3529 | 
            +
                        }
         | 
| 3530 | 
            +
                        qs += 4;
         | 
| 3531 | 
            +
                        qh += 2;
         | 
| 3532 | 
            +
                    }
         | 
| 3533 | 
            +
                }
         | 
| 3534 | 
            +
            }
         | 
| 3535 | 
            +
             | 
| 3477 3536 | 
             
            static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
         | 
| 3478 3537 |  | 
| 3479 3538 | 
             
            void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
         | 
| @@ -9695,6 +9754,248 @@ void ggml_vec_dot_iq1_s_q8_K  (int n, float * restrict s, size_t bs, const void | |
| 9695 9754 | 
             
            #endif
         | 
| 9696 9755 | 
             
            }
         | 
| 9697 9756 |  | 
| 9757 | 
            +
            void ggml_vec_dot_iq1_m_q8_K  (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         | 
| 9758 | 
            +
                assert(n % QK_K == 0);
         | 
| 9759 | 
            +
                assert(nrc == 1);
         | 
| 9760 | 
            +
                UNUSED(nrc);
         | 
| 9761 | 
            +
                UNUSED(bx);
         | 
| 9762 | 
            +
                UNUSED(by);
         | 
| 9763 | 
            +
                UNUSED(bs);
         | 
| 9764 | 
            +
             | 
| 9765 | 
            +
                const block_iq1_m * restrict x = vx;
         | 
| 9766 | 
            +
                const block_q8_K  * restrict y = vy;
         | 
| 9767 | 
            +
             | 
| 9768 | 
            +
                const int nb = n / QK_K;
         | 
| 9769 | 
            +
             | 
| 9770 | 
            +
            #if QK_K != 64
         | 
| 9771 | 
            +
                iq1m_scale_t scale;
         | 
| 9772 | 
            +
            #endif
         | 
| 9773 | 
            +
             | 
| 9774 | 
            +
            #if defined __ARM_NEON
         | 
| 9775 | 
            +
             | 
| 9776 | 
            +
            #if QK_K == 64
         | 
| 9777 | 
            +
                const int32x4_t mask  = vdupq_n_s32(0xf);
         | 
| 9778 | 
            +
            #else
         | 
| 9779 | 
            +
                const int32x4_t mask  = vdupq_n_s32(0x7);
         | 
| 9780 | 
            +
            #endif
         | 
| 9781 | 
            +
                const int32x4_t mone  = vdupq_n_s32(1);
         | 
| 9782 | 
            +
                const int32x4_t mzero = vdupq_n_s32(0);
         | 
| 9783 | 
            +
             | 
| 9784 | 
            +
                ggml_int8x16x4_t deltas;
         | 
| 9785 | 
            +
                deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
         | 
| 9786 | 
            +
                deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
         | 
| 9787 | 
            +
                deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
         | 
| 9788 | 
            +
                deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
         | 
| 9789 | 
            +
             | 
| 9790 | 
            +
                ggml_int8x16x4_t q1b;
         | 
| 9791 | 
            +
                ggml_int8x16x4_t q8b;
         | 
| 9792 | 
            +
             | 
| 9793 | 
            +
                uint32_t aux32;
         | 
| 9794 | 
            +
                const uint8_t * aux8 = (const uint8_t *)&aux32;
         | 
| 9795 | 
            +
             | 
| 9796 | 
            +
                float sumf = 0;
         | 
| 9797 | 
            +
                for (int i = 0; i < nb; ++i) {
         | 
| 9798 | 
            +
             | 
| 9799 | 
            +
                    const int8_t   * q8 = y[i].qs;
         | 
| 9800 | 
            +
                    const uint8_t  * qs = x[i].qs;
         | 
| 9801 | 
            +
                    const uint8_t  * qh = x[i].qh;
         | 
| 9802 | 
            +
                    const uint16_t * sc = (const uint16_t *)x[i].scales;
         | 
| 9803 | 
            +
             | 
| 9804 | 
            +
            #if QK_K != 64
         | 
| 9805 | 
            +
                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         | 
| 9806 | 
            +
            #endif
         | 
| 9807 | 
            +
             | 
| 9808 | 
            +
                    int32x4_t sumi1 = mzero;
         | 
| 9809 | 
            +
                    int32x4_t sumi2 = mzero;
         | 
| 9810 | 
            +
             | 
| 9811 | 
            +
                    for (int ib = 0; ib < QK_K/32; ib += 2) {
         | 
| 9812 | 
            +
             | 
| 9813 | 
            +
                        q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
         | 
| 9814 | 
            +
                                                 vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
         | 
| 9815 | 
            +
                        q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
         | 
| 9816 | 
            +
                                                 vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
         | 
| 9817 | 
            +
                        q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
         | 
| 9818 | 
            +
                                                 vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
         | 
| 9819 | 
            +
                        q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
         | 
| 9820 | 
            +
                                                 vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
         | 
| 9821 | 
            +
             | 
| 9822 | 
            +
                        q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
         | 
| 9823 | 
            +
             | 
| 9824 | 
            +
                        const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
         | 
| 9825 | 
            +
                        const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
         | 
| 9826 | 
            +
                        const int32x4_t p12 = vpaddq_s32(p1, p2);
         | 
| 9827 | 
            +
             | 
| 9828 | 
            +
                        const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
         | 
| 9829 | 
            +
                        aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
         | 
| 9830 | 
            +
             | 
| 9831 | 
            +
                        const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
         | 
| 9832 | 
            +
                        const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
         | 
| 9833 | 
            +
                        const int32x4_t p34 = vpaddq_s32(p3, p4);
         | 
| 9834 | 
            +
             | 
| 9835 | 
            +
            #if QK_K == 64
         | 
| 9836 | 
            +
                        int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12);
         | 
| 9837 | 
            +
            #else
         | 
| 9838 | 
            +
                        int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
         | 
| 9839 | 
            +
            #endif
         | 
| 9840 | 
            +
                        scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
         | 
| 9841 | 
            +
             | 
| 9842 | 
            +
                        sumi1 = vmlaq_s32(sumi1, scales_4, p12);
         | 
| 9843 | 
            +
                        sumi2 = vmlaq_s32(sumi2, scales_4, p34);
         | 
| 9844 | 
            +
             | 
| 9845 | 
            +
                        qs += 8; qh += 4;
         | 
| 9846 | 
            +
             | 
| 9847 | 
            +
                    }
         | 
| 9848 | 
            +
             | 
| 9849 | 
            +
            #if QK_K == 64
         | 
| 9850 | 
            +
                    sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
         | 
| 9851 | 
            +
            #else
         | 
| 9852 | 
            +
                    sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
         | 
| 9853 | 
            +
            #endif
         | 
| 9854 | 
            +
                }
         | 
| 9855 | 
            +
             | 
| 9856 | 
            +
                *s = sumf;
         | 
| 9857 | 
            +
             | 
| 9858 | 
            +
            #elif defined __AVX2__
         | 
| 9859 | 
            +
             | 
| 9860 | 
            +
            #if QK_K == 64
         | 
| 9861 | 
            +
                const __m256i mask = _mm256_set1_epi16(0xf);
         | 
| 9862 | 
            +
            #else
         | 
| 9863 | 
            +
                const __m256i mask = _mm256_set1_epi16(0x7);
         | 
| 9864 | 
            +
            #endif
         | 
| 9865 | 
            +
                const __m256i mone = _mm256_set1_epi16(1);
         | 
| 9866 | 
            +
             | 
| 9867 | 
            +
                __m256 accum1 = _mm256_setzero_ps();
         | 
| 9868 | 
            +
                __m256 accum2 = _mm256_setzero_ps();
         | 
| 9869 | 
            +
                for (int i = 0; i < nb; ++i) {
         | 
| 9870 | 
            +
             | 
| 9871 | 
            +
                    const int8_t   * q8 = y[i].qs;
         | 
| 9872 | 
            +
                    const uint8_t  * qs = x[i].qs;
         | 
| 9873 | 
            +
                    const uint8_t  * qh = x[i].qh;
         | 
| 9874 | 
            +
                    const uint16_t * sc = (const uint16_t *)x[i].scales;
         | 
| 9875 | 
            +
             | 
| 9876 | 
            +
            #if QK_K != 64
         | 
| 9877 | 
            +
                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         | 
| 9878 | 
            +
            #endif
         | 
| 9879 | 
            +
             | 
| 9880 | 
            +
                    __m256i sumi1 = _mm256_setzero_si256();
         | 
| 9881 | 
            +
                    __m256i sumi2 = _mm256_setzero_si256();
         | 
| 9882 | 
            +
                    for (int ib = 0; ib < QK_K/32; ib += 2) {
         | 
| 9883 | 
            +
                        const __m256i q1b_1 = _mm256_set_epi64x(
         | 
| 9884 | 
            +
                                iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
         | 
| 9885 | 
            +
                                iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
         | 
| 9886 | 
            +
                        );
         | 
| 9887 | 
            +
                        const __m256i q1b_2 = _mm256_set_epi64x(
         | 
| 9888 | 
            +
                                iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
         | 
| 9889 | 
            +
                                iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
         | 
| 9890 | 
            +
                        );
         | 
| 9891 | 
            +
                        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
         | 
| 9892 | 
            +
                        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
         | 
| 9893 | 
            +
             | 
| 9894 | 
            +
                        const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
         | 
| 9895 | 
            +
                        const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
         | 
| 9896 | 
            +
             | 
| 9897 | 
            +
                        const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
         | 
| 9898 | 
            +
                                                                 qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
         | 
| 9899 | 
            +
                                                                 qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
         | 
| 9900 | 
            +
                                                                 qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
         | 
| 9901 | 
            +
                        const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
         | 
| 9902 | 
            +
                                                                 qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
         | 
| 9903 | 
            +
                                                                 qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
         | 
| 9904 | 
            +
                                                                 qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
         | 
| 9905 | 
            +
             | 
| 9906 | 
            +
                        const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
         | 
| 9907 | 
            +
                        const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
         | 
| 9908 | 
            +
            #if QK_K == 64
         | 
| 9909 | 
            +
                        __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >>  4), _mm_set1_epi16(sc[0] >> 0));
         | 
| 9910 | 
            +
                        __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8));
         | 
| 9911 | 
            +
            #else
         | 
| 9912 | 
            +
                        __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
         | 
| 9913 | 
            +
                        __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
         | 
| 9914 | 
            +
            #endif
         | 
| 9915 | 
            +
                        scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
         | 
| 9916 | 
            +
                        scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
         | 
| 9917 | 
            +
                        const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
         | 
| 9918 | 
            +
                        const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
         | 
| 9919 | 
            +
                        const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
         | 
| 9920 | 
            +
                        const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
         | 
| 9921 | 
            +
             | 
| 9922 | 
            +
                        sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
         | 
| 9923 | 
            +
                        sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
         | 
| 9924 | 
            +
             | 
| 9925 | 
            +
                        qs += 8; qh += 4;
         | 
| 9926 | 
            +
                    }
         | 
| 9927 | 
            +
             | 
| 9928 | 
            +
            #if QK_K == 64
         | 
| 9929 | 
            +
                    const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
         | 
| 9930 | 
            +
            #else
         | 
| 9931 | 
            +
                    const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
         | 
| 9932 | 
            +
            #endif
         | 
| 9933 | 
            +
                    accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
         | 
| 9934 | 
            +
                    accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
         | 
| 9935 | 
            +
             | 
| 9936 | 
            +
                }
         | 
| 9937 | 
            +
             | 
| 9938 | 
            +
                *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
         | 
| 9939 | 
            +
             | 
| 9940 | 
            +
            #else
         | 
| 9941 | 
            +
             | 
| 9942 | 
            +
                int sum1[2], sum2[2], delta[4];
         | 
| 9943 | 
            +
             | 
| 9944 | 
            +
                float sumf = 0;
         | 
| 9945 | 
            +
                for (int i = 0; i < nb; i++) {
         | 
| 9946 | 
            +
             | 
| 9947 | 
            +
                    const int8_t   * q8 = y[i].qs;
         | 
| 9948 | 
            +
                    const uint8_t  * qs = x[i].qs;
         | 
| 9949 | 
            +
                    const uint8_t  * qh = x[i].qh;
         | 
| 9950 | 
            +
                    const uint16_t * sc = (const uint16_t *)x[i].scales;
         | 
| 9951 | 
            +
             | 
| 9952 | 
            +
            #if QK_K != 64
         | 
| 9953 | 
            +
                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         | 
| 9954 | 
            +
            #endif
         | 
| 9955 | 
            +
             | 
| 9956 | 
            +
                    int sumi1 = 0, sumi2 = 0;
         | 
| 9957 | 
            +
                    for (int ib = 0; ib < QK_K/32; ++ib) {
         | 
| 9958 | 
            +
                        delta[0] = qh[0] & 0x08 ? -1 : 1;
         | 
| 9959 | 
            +
                        delta[1] = qh[0] & 0x80 ? -1 : 1;
         | 
| 9960 | 
            +
                        delta[2] = qh[1] & 0x08 ? -1 : 1;
         | 
| 9961 | 
            +
                        delta[3] = qh[1] & 0x80 ? -1 : 1;
         | 
| 9962 | 
            +
                        sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
         | 
| 9963 | 
            +
                        for (int l = 0; l < 4; ++l) {
         | 
| 9964 | 
            +
                            const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
         | 
| 9965 | 
            +
                            int lsum1 = 0, lsum2 = 0;
         | 
| 9966 | 
            +
                            for (int j = 0; j < 8; ++j) {
         | 
| 9967 | 
            +
                                lsum1 += q8[j] * grid[j];
         | 
| 9968 | 
            +
                                lsum2 += q8[j];
         | 
| 9969 | 
            +
                            }
         | 
| 9970 | 
            +
                            q8 += 8;
         | 
| 9971 | 
            +
                            sum1[l/2] += lsum1;
         | 
| 9972 | 
            +
                            sum2[l/2] += lsum2*delta[l];
         | 
| 9973 | 
            +
                        }
         | 
| 9974 | 
            +
            #if QK_K == 64
         | 
| 9975 | 
            +
                        const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1;
         | 
| 9976 | 
            +
                        const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1;
         | 
| 9977 | 
            +
            #else
         | 
| 9978 | 
            +
                        const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
         | 
| 9979 | 
            +
                        const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
         | 
| 9980 | 
            +
            #endif
         | 
| 9981 | 
            +
                        sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
         | 
| 9982 | 
            +
                        sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
         | 
| 9983 | 
            +
                        qs += 4;
         | 
| 9984 | 
            +
                        qh += 2;
         | 
| 9985 | 
            +
                    }
         | 
| 9986 | 
            +
             | 
| 9987 | 
            +
            #if QK_K == 64
         | 
| 9988 | 
            +
                    sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
         | 
| 9989 | 
            +
            #else
         | 
| 9990 | 
            +
                    sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
         | 
| 9991 | 
            +
            #endif
         | 
| 9992 | 
            +
                }
         | 
| 9993 | 
            +
             | 
| 9994 | 
            +
                *s = sumf;
         | 
| 9995 | 
            +
             | 
| 9996 | 
            +
            #endif
         | 
| 9997 | 
            +
            }
         | 
| 9998 | 
            +
             | 
| 9698 9999 | 
             
            void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
         | 
| 9699 10000 | 
             
                assert(nrc == 1);
         | 
| 9700 10001 | 
             
                UNUSED(nrc);
         | 
| @@ -9938,17 +10239,17 @@ static iq2_entry_t iq2_data[4] = { | |
| 9938 10239 | 
             
            };
         | 
| 9939 10240 |  | 
| 9940 10241 | 
             
            static inline int iq2_data_index(enum ggml_type type) {
         | 
| 9941 | 
            -
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
         | 
| 10242 | 
            +
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
         | 
| 9942 10243 | 
             
                return type == GGML_TYPE_IQ2_XXS ? 0 :
         | 
| 9943 10244 | 
             
                       type == GGML_TYPE_IQ2_XS  ? 1 :
         | 
| 9944 | 
            -
                   type == GGML_TYPE_IQ1_S   ? 2 : 3;
         | 
| 10245 | 
            +
                       type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
         | 
| 9945 10246 | 
             
            }
         | 
| 9946 10247 |  | 
| 9947 10248 | 
             
            static inline int iq2_grid_size(enum ggml_type type) {
         | 
| 9948 | 
            -
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
         | 
| 10249 | 
            +
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
         | 
| 9949 10250 | 
             
                return type == GGML_TYPE_IQ2_XXS ? 256 :
         | 
| 9950 10251 | 
             
                       type == GGML_TYPE_IQ2_XS  ? 512 :
         | 
| 9951 | 
            -
                   type == GGML_TYPE_IQ1_S   ? NGRID_IQ1S : 1024;
         | 
| 10252 | 
            +
                       type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
         | 
| 9952 10253 | 
             
            }
         | 
| 9953 10254 |  | 
| 9954 10255 | 
             
            static int iq2_compare_func(const void * left, const void * right) {
         | 
| @@ -10214,10 +10515,10 @@ void iq2xs_init_impl(enum ggml_type type) { | |
| 10214 10515 |  | 
| 10215 10516 | 
             
                const int kmap_size = 43692;
         | 
| 10216 10517 | 
             
                //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
         | 
| 10217 | 
            -
                const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
         | 
| 10518 | 
            +
                const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
         | 
| 10218 10519 | 
             
                const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
         | 
| 10219 10520 | 
             
                                         type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 :
         | 
| 10220 | 
            -
                                         type == GGML_TYPE_IQ1_S | 
| 10521 | 
            +
                                         type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
         | 
| 10221 10522 | 
             
                uint64_t * kgrid_q2xs;
         | 
| 10222 10523 | 
             
                int      * kmap_q2xs;
         | 
| 10223 10524 | 
             
                uint16_t * kneighbors_q2xs;
         | 
| @@ -10314,7 +10615,7 @@ void iq2xs_init_impl(enum ggml_type type) { | |
| 10314 10615 | 
             
            }
         | 
| 10315 10616 |  | 
| 10316 10617 | 
             
            void iq2xs_free_impl(enum ggml_type type) {
         | 
| 10317 | 
            -
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
         | 
| 10618 | 
            +
                GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
         | 
| 10318 10619 | 
             
                const int gindex = iq2_data_index(type);
         | 
| 10319 10620 | 
             
                if (iq2_data[gindex].grid) {
         | 
| 10320 10621 | 
             
                    free(iq2_data[gindex].grid);       iq2_data[gindex].grid = NULL;
         | 
| @@ -11520,7 +11821,16 @@ static int iq1_sort_helper(const void * left, const void * right) { | |
| 11520 11821 | 
             
            }
         | 
| 11521 11822 |  | 
| 11522 11823 | 
             
            #define IQ1S_BLOCK_SIZE 32
         | 
| 11523 | 
            -
             | 
| 11824 | 
            +
            #define IQ1M_BLOCK_SIZE 16
         | 
| 11825 | 
            +
            static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
         | 
| 11826 | 
            +
                    float    * scales,
         | 
| 11827 | 
            +
                    float    * weight,
         | 
| 11828 | 
            +
                    float    * sumx,
         | 
| 11829 | 
            +
                    float    * sumw,
         | 
| 11830 | 
            +
                    float    * pairs,
         | 
| 11831 | 
            +
                    int8_t   * L,
         | 
| 11832 | 
            +
                    uint16_t * index,
         | 
| 11833 | 
            +
                    int8_t   * shifts) {
         | 
| 11524 11834 |  | 
| 11525 11835 | 
             
                const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
         | 
| 11526 11836 |  | 
| @@ -11534,22 +11844,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy | |
| 11534 11844 | 
             
                GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
         | 
| 11535 11845 | 
             
                GGML_ASSERT(n%QK_K == 0);
         | 
| 11536 11846 |  | 
| 11847 | 
            +
                block_iq1_s * y = vy;
         | 
| 11848 | 
            +
             | 
| 11537 11849 | 
             
                const int nbl = n/QK_K;
         | 
| 11538 11850 |  | 
| 11539 | 
            -
                 | 
| 11851 | 
            +
                const int block_size = IQ1S_BLOCK_SIZE;
         | 
| 11540 11852 |  | 
| 11541 11853 | 
             
                const float x_p[3] = {-1 + IQ1S_DELTA,  IQ1S_DELTA, 1 + IQ1S_DELTA};
         | 
| 11542 11854 | 
             
                const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
         | 
| 11543 11855 |  | 
| 11544 | 
            -
             | 
| 11545 | 
            -
                float  weight[IQ1S_BLOCK_SIZE];
         | 
| 11546 | 
            -
                int8_t L[IQ1S_BLOCK_SIZE];
         | 
| 11547 | 
            -
                float  sumx[IQ1S_BLOCK_SIZE+1];
         | 
| 11548 | 
            -
                float  sumw[IQ1S_BLOCK_SIZE+1];
         | 
| 11549 | 
            -
                float  pairs[2*IQ1S_BLOCK_SIZE];
         | 
| 11856 | 
            +
             | 
| 11550 11857 | 
             
                int * idx = (int *)(pairs + 1);
         | 
| 11551 | 
            -
                uint16_t index[IQ1S_BLOCK_SIZE/8];
         | 
| 11552 | 
            -
                int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
         | 
| 11553 11858 |  | 
| 11554 11859 | 
             
                for (int ibl = 0; ibl < nbl; ++ibl) {
         | 
| 11555 11860 |  | 
| @@ -11564,15 +11869,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy | |
| 11564 11869 | 
             
                    for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
         | 
| 11565 11870 | 
             
                    float sigma2 = 2*sumx2/QK_K;
         | 
| 11566 11871 |  | 
| 11567 | 
            -
                    for (int ib = 0; ib < QK_K/ | 
| 11568 | 
            -
                        const float * xb = xbl +  | 
| 11569 | 
            -
                        const float * qw = quant_weights + QK_K*ibl +  | 
| 11570 | 
            -
                        for (int i = 0; i <  | 
| 11872 | 
            +
                    for (int ib = 0; ib < QK_K/block_size; ++ib) {
         | 
| 11873 | 
            +
                        const float * xb = xbl + block_size*ib;
         | 
| 11874 | 
            +
                        const float * qw = quant_weights + QK_K*ibl + block_size*ib;
         | 
| 11875 | 
            +
                        for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
         | 
| 11571 11876 | 
             
                        float max = fabsf(xb[0]);
         | 
| 11572 | 
            -
                        for (int i = 1; i <  | 
| 11877 | 
            +
                        for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
         | 
| 11573 11878 | 
             
                        if (!max) {
         | 
| 11574 11879 | 
             
                            scales[ib] = 0;
         | 
| 11575 | 
            -
                            memset(L, 1,  | 
| 11880 | 
            +
                            memset(L, 1, block_size);
         | 
| 11576 11881 | 
             
                            continue;
         | 
| 11577 11882 | 
             
                        }
         | 
| 11578 11883 | 
             
                        // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
         | 
| @@ -11581,14 +11886,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy | |
| 11581 11886 | 
             
                        // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
         | 
| 11582 11887 | 
             
                        // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
         | 
| 11583 11888 | 
             
                        // for each possible split, and the score for each split.
         | 
| 11584 | 
            -
                        for (int j = 0; j <  | 
| 11889 | 
            +
                        for (int j = 0; j < block_size; ++j) {
         | 
| 11585 11890 | 
             
                            pairs[2*j] = xb[j];
         | 
| 11586 11891 | 
             
                            idx[2*j] = j;
         | 
| 11587 11892 | 
             
                        }
         | 
| 11588 | 
            -
                        qsort(pairs,  | 
| 11893 | 
            +
                        qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
         | 
| 11589 11894 | 
             
                        {
         | 
| 11590 11895 | 
             
                            sumx[0] = sumw[0] = 0;
         | 
| 11591 | 
            -
                            for (int j = 0; j <  | 
| 11896 | 
            +
                            for (int j = 0; j < block_size; ++j) {
         | 
| 11592 11897 | 
             
                                int i = idx[2*j];
         | 
| 11593 11898 | 
             
                                sumx[j+1] = sumx[j] + weight[i]*xb[i];
         | 
| 11594 11899 | 
             
                                sumw[j+1] = sumw[j] + weight[i];
         | 
| @@ -11596,16 +11901,16 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy | |
| 11596 11901 | 
             
                        }
         | 
| 11597 11902 | 
             
                        float best_score = 0, scale = max;
         | 
| 11598 11903 | 
             
                        int besti1 = -1, besti2 = -1, best_shift = 0;
         | 
| 11599 | 
            -
                        for (int i1 = 0; i1 <=  | 
| 11600 | 
            -
                            for (int i2 = i1; i2 <=  | 
| 11601 | 
            -
                                float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[ | 
| 11602 | 
            -
                                float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[ | 
| 11904 | 
            +
                        for (int i1 = 0; i1 <= block_size; ++i1) {
         | 
| 11905 | 
            +
                            for (int i2 = i1; i2 <= block_size; ++i2) {
         | 
| 11906 | 
            +
                                float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
         | 
| 11907 | 
            +
                                float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
         | 
| 11603 11908 | 
             
                                if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
         | 
| 11604 11909 | 
             
                                    scale = sumqx/sumq2; best_score = scale*sumqx;
         | 
| 11605 11910 | 
             
                                    besti1 = i1; besti2 = i2; best_shift = 1;
         | 
| 11606 11911 | 
             
                                }
         | 
| 11607 | 
            -
                                sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[ | 
| 11608 | 
            -
                                sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[ | 
| 11912 | 
            +
                                sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
         | 
| 11913 | 
            +
                                sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
         | 
| 11609 11914 | 
             
                                if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
         | 
| 11610 11915 | 
             
                                    scale = sumqx/sumq2; best_score = scale*sumqx;
         | 
| 11611 11916 | 
             
                                    besti1 = i1; besti2 = i2; best_shift = -1;
         | 
| @@ -11615,14 +11920,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy | |
| 11615 11920 | 
             
                        GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
         | 
| 11616 11921 | 
             
                        for (int j =      0; j < besti1; ++j) L[idx[2*j]] = 0;
         | 
| 11617 11922 | 
             
                        for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
         | 
| 11618 | 
            -
                        for (int j = besti2; j <  | 
| 11923 | 
            +
                        for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
         | 
| 11619 11924 | 
             
                        if (scale < 0) {
         | 
| 11620 | 
            -
                            for (int j = 0; j <  | 
| 11925 | 
            +
                            for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
         | 
| 11621 11926 | 
             
                            scale = -scale; best_shift = -best_shift;
         | 
| 11622 11927 | 
             
                        }
         | 
| 11623 11928 | 
             
                        bool all_on_grid = true;
         | 
| 11624 11929 | 
             
                        const float * xx = best_shift == 1 ? x_p : x_m;
         | 
| 11625 | 
            -
                        for (int k = 0; k <  | 
| 11930 | 
            +
                        for (int k = 0; k < block_size/8; ++k) {
         | 
| 11626 11931 | 
             
                            uint16_t u = 0;
         | 
| 11627 11932 | 
             
                            for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
         | 
| 11628 11933 | 
             
                            int grid_index = kmap_q2xs[u];
         | 
| @@ -11636,7 +11941,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy | |
| 11636 11941 | 
             
                        }
         | 
| 11637 11942 | 
             
                        if (!all_on_grid) {
         | 
| 11638 11943 | 
             
                            float sumqx = 0, sumq2 = 0;
         | 
| 11639 | 
            -
                            for (int k = 0; k <  | 
| 11944 | 
            +
                            for (int k = 0; k < block_size/8; ++k) {
         | 
| 11640 11945 | 
             
                                const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
         | 
| 11641 11946 | 
             
                                for (int j = 0; j < 8; ++j) {
         | 
| 11642 11947 | 
             
                                    float w = weight[8*k + j];
         | 
| @@ -11648,8 +11953,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy | |
| 11648 11953 | 
             
                            if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
         | 
| 11649 11954 | 
             
                        }
         | 
| 11650 11955 | 
             
                        uint16_t h = 0;
         | 
| 11651 | 
            -
                        for (int k = 0; k <  | 
| 11652 | 
            -
                            y[ibl].qs[( | 
| 11956 | 
            +
                        for (int k = 0; k < block_size/8; ++k) {
         | 
| 11957 | 
            +
                            y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
         | 
| 11653 11958 | 
             
                            h |= (index[k] >> 8) << 3*k;
         | 
| 11654 11959 | 
             
                        }
         | 
| 11655 11960 | 
             
                        y[ibl].qh[ib] = h;
         | 
| @@ -11660,14 +11965,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy | |
| 11660 11965 | 
             
                    }
         | 
| 11661 11966 |  | 
| 11662 11967 | 
             
                    if (!max_scale) {
         | 
| 11663 | 
            -
                        memset(y[ibl].qs, 0, QK_K/8);
         | 
| 11664 11968 | 
             
                        continue;
         | 
| 11665 11969 | 
             
                    }
         | 
| 11666 11970 |  | 
| 11667 11971 | 
             
                    float d = max_scale/15;
         | 
| 11668 | 
            -
                    y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1. | 
| 11972 | 
            +
                    y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
         | 
| 11669 11973 | 
             
                    float id = 1/d;
         | 
| 11670 | 
            -
                    for (int ib = 0; ib < QK_K/ | 
| 11974 | 
            +
                    for (int ib = 0; ib < QK_K/block_size; ++ib) {
         | 
| 11671 11975 | 
             
                        int l = nearest_int(0.5f*(id*scales[ib]-1));
         | 
| 11672 11976 | 
             
                        l = MAX(0, MIN(7, l));
         | 
| 11673 11977 | 
             
                        if (shifts[ib] == -1) l |= 8;
         | 
| @@ -11678,16 +11982,307 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy | |
| 11678 11982 |  | 
| 11679 11983 | 
             
            // Quantizes `nrow` rows of `n_per_row` floats from `src` into block_iq1_s
            // blocks written to `dst`, one quantize_row_iq1_s_impl() call per row.
            //
            //   src           input floats; advanced by n_per_row after each row
            //   dst           output buffer; advanced by nblock*sizeof(block_iq1_s) per row,
            //                 where nblock = n_per_row/QK_K
            //   nrow          number of rows to process
            //   n_per_row     elements per row; must be a multiple of QK_K (asserted)
            //   quant_weights passed unchanged to every row call (it is not advanced per row)
            //
            // Returns nrow * nblock * sizeof(block_iq1_s), i.e. the byte span of dst
            // that the loop steps over.
            //
            // As of the `+` lines of this hunk, the scratch arrays (scales, weight, L, sumx,
            // sumw, pairs, index, shifts) are declared here, sized from IQ1S_BLOCK_SIZE and
            // QK_K, and handed to the row routine as pointers; the same arrays are reused
            // for every row.
            size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
         | 
| 11680 11984 | 
             
                GGML_ASSERT(n_per_row%QK_K == 0);
         | 
| 11985 | 
            +
                float  scales[QK_K/IQ1S_BLOCK_SIZE];
         | 
| 11986 | 
            +
                float  weight[IQ1S_BLOCK_SIZE];
         | 
| 11987 | 
            +
                int8_t L[IQ1S_BLOCK_SIZE];
         | 
| 11988 | 
            +
                float  sumx[IQ1S_BLOCK_SIZE+1];
         | 
| 11989 | 
            +
                float  sumw[IQ1S_BLOCK_SIZE+1];
         | 
| 11990 | 
            +
                float  pairs[2*IQ1S_BLOCK_SIZE];
         | 
| 11991 | 
            +
                uint16_t index[IQ1S_BLOCK_SIZE/8];
         | 
| 11992 | 
            +
                int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
         | 
| 11681 11993 | 
             
                int nblock = n_per_row/QK_K;
         | 
| 11682 11994 | 
             
                char * qrow = (char *)dst;
         | 
| 11683 11995 | 
             
                for (int row = 0; row < nrow; ++row) {
         | 
| 11684 | 
            -
                    quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
         | 
| 11996 | 
            +
                    quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
         | 
| 11685 11997 | 
             
                    src += n_per_row;
         | 
| 11686 11998 | 
             
                    qrow += nblock*sizeof(block_iq1_s);
         | 
| 11687 11999 | 
             
                }
         | 
| 11688 12000 | 
             
                return nrow * nblock * sizeof(block_iq1_s);
         | 
| 11689 12001 | 
             
            }
         | 
| 11690 12002 |  | 
| 12003 | 
            +
            static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
         | 
| 12004 | 
            +
                    float    * scales,
         | 
| 12005 | 
            +
                    float    * weight,
         | 
| 12006 | 
            +
                    float    * pairs,
         | 
| 12007 | 
            +
                    int8_t   * L,
         | 
| 12008 | 
            +
                    uint16_t * index,
         | 
| 12009 | 
            +
                    int8_t   * shifts) {
         | 
| 12010 | 
            +
             | 
| 12011 | 
            +
                const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
         | 
| 12012 | 
            +
             | 
| 12013 | 
            +
                const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
         | 
| 12014 | 
            +
                const int      * kmap_q2xs       = iq2_data[gindex].map;
         | 
| 12015 | 
            +
                const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
         | 
| 12016 | 
            +
             | 
| 12017 | 
            +
                //GGML_ASSERT(quant_weights   && "missing quantization weights");
         | 
| 12018 | 
            +
                GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
         | 
| 12019 | 
            +
                GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
         | 
| 12020 | 
            +
                GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
         | 
| 12021 | 
            +
                GGML_ASSERT(n%QK_K == 0);
         | 
| 12022 | 
            +
             | 
| 12023 | 
            +
                block_iq1_m * y = vy;
         | 
| 12024 | 
            +
             | 
| 12025 | 
            +
                const int nbl = n/QK_K;
         | 
| 12026 | 
            +
             | 
| 12027 | 
            +
                const int block_size = IQ1M_BLOCK_SIZE;
         | 
| 12028 | 
            +
             | 
| 12029 | 
            +
                const float x_p[3] = {-1 + IQ1M_DELTA,  IQ1M_DELTA, 1 + IQ1M_DELTA};
         | 
| 12030 | 
            +
                const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
         | 
| 12031 | 
            +
                const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
         | 
| 12032 | 
            +
             | 
| 12033 | 
            +
                int * idx = (int *)(pairs + 1);
         | 
| 12034 | 
            +
             | 
| 12035 | 
            +
                float sumqx[4], sumq2[4];
         | 
| 12036 | 
            +
             | 
| 12037 | 
            +
                iq1m_scale_t s;
         | 
| 12038 | 
            +
                const float * xx;
         | 
| 12039 | 
            +
             | 
| 12040 | 
            +
                for (int ibl = 0; ibl < nbl; ++ibl) {
         | 
| 12041 | 
            +
             | 
| 12042 | 
            +
            #if QK_K == 64
         | 
| 12043 | 
            +
                    y[ibl].d = GGML_FP32_TO_FP16(0.f);
         | 
| 12044 | 
            +
            #endif
         | 
| 12045 | 
            +
                    memset(y[ibl].qs, 0, QK_K/8);
         | 
| 12046 | 
            +
                    memset(y[ibl].qh, 0, QK_K/16);
         | 
| 12047 | 
            +
                    memset(y[ibl].scales, 0, QK_K/32);
         | 
| 12048 | 
            +
             | 
| 12049 | 
            +
                    float max_scale = 0;
         | 
| 12050 | 
            +
             | 
| 12051 | 
            +
                    const float * xbl = x + QK_K*ibl;
         | 
| 12052 | 
            +
                    float sumx2 = 0;
         | 
| 12053 | 
            +
                    for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
         | 
| 12054 | 
            +
                    float sigma2 = 2*sumx2/QK_K;
         | 
| 12055 | 
            +
             | 
| 12056 | 
            +
                    for (int ib = 0; ib < QK_K/block_size; ++ib) {
         | 
| 12057 | 
            +
                        const float * xb = xbl + block_size*ib;
         | 
| 12058 | 
            +
                        if (quant_weights) {
         | 
| 12059 | 
            +
                            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
         | 
| 12060 | 
            +
                            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
         | 
| 12061 | 
            +
                        } else {
         | 
| 12062 | 
            +
                            for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
         | 
| 12063 | 
            +
                        }
         | 
| 12064 | 
            +
                        float max = fabsf(xb[0]);
         | 
| 12065 | 
            +
                        for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
         | 
| 12066 | 
            +
                        if (!max) {
         | 
| 12067 | 
            +
                            scales[ib] = 0;
         | 
| 12068 | 
            +
                            memset(L, 1, block_size);
         | 
| 12069 | 
            +
                            continue;
         | 
| 12070 | 
            +
                        }
         | 
| 12071 | 
            +
                        // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
         | 
| 12072 | 
            +
                        // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
         | 
| 12073 | 
            +
                        // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
         | 
| 12074 | 
            +
                        // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
         | 
| 12075 | 
            +
                        // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
         | 
| 12076 | 
            +
                        // for each possible split, and the score for each split.
         | 
| 12077 | 
            +
                        for (int j = 0; j < block_size; ++j) {
         | 
| 12078 | 
            +
                            pairs[2*j] = xb[j];
         | 
| 12079 | 
            +
                            idx[2*j] = j;
         | 
| 12080 | 
            +
                        }
         | 
| 12081 | 
            +
                        qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
         | 
| 12082 | 
            +
                        float best_score = 0, scale = max;
         | 
| 12083 | 
            +
                        int besti1 = -1, besti2 = -1, best_k = -1;
         | 
| 12084 | 
            +
                        // 0: +, +
         | 
| 12085 | 
            +
                        // 1: +, -
         | 
| 12086 | 
            +
                        // 2: -, +
         | 
| 12087 | 
            +
                        // 3: -, -
         | 
| 12088 | 
            +
                        for (int i1 = 0; i1 <= block_size; ++i1) {
         | 
| 12089 | 
            +
                            for (int i2 = i1; i2 <= block_size; ++i2) {
         | 
| 12090 | 
            +
                                memset(sumqx, 0, 4*sizeof(float));
         | 
| 12091 | 
            +
                                memset(sumq2, 0, 4*sizeof(float));
         | 
| 12092 | 
            +
                                for (int j = 0; j < i1; ++j) {
         | 
| 12093 | 
            +
                                    int i = idx[2*j];
         | 
| 12094 | 
            +
                                    if (i < block_size/2) {
         | 
| 12095 | 
            +
                                        sumqx[0] += weight[i]*x_p[0]*xb[i];
         | 
| 12096 | 
            +
                                        sumqx[1] += weight[i]*x_p[0]*xb[i];
         | 
| 12097 | 
            +
                                        sumqx[2] += weight[i]*x_m[0]*xb[i];
         | 
| 12098 | 
            +
                                        sumqx[3] += weight[i]*x_m[0]*xb[i];
         | 
| 12099 | 
            +
                                        sumq2[0] += weight[i]*x_p[0]*x_p[0];
         | 
| 12100 | 
            +
                                        sumq2[1] += weight[i]*x_p[0]*x_p[0];
         | 
| 12101 | 
            +
                                        sumq2[2] += weight[i]*x_m[0]*x_m[0];
         | 
| 12102 | 
            +
                                        sumq2[3] += weight[i]*x_m[0]*x_m[0];
         | 
| 12103 | 
            +
                                    } else {
         | 
| 12104 | 
            +
                                        sumqx[0] += weight[i]*x_p[0]*xb[i];
         | 
| 12105 | 
            +
                                        sumqx[2] += weight[i]*x_p[0]*xb[i];
         | 
| 12106 | 
            +
                                        sumqx[1] += weight[i]*x_m[0]*xb[i];
         | 
| 12107 | 
            +
                                        sumqx[3] += weight[i]*x_m[0]*xb[i];
         | 
| 12108 | 
            +
                                        sumq2[0] += weight[i]*x_p[0]*x_p[0];
         | 
| 12109 | 
            +
                                        sumq2[2] += weight[i]*x_p[0]*x_p[0];
         | 
| 12110 | 
            +
                                        sumq2[1] += weight[i]*x_m[0]*x_m[0];
         | 
| 12111 | 
            +
                                        sumq2[3] += weight[i]*x_m[0]*x_m[0];
         | 
| 12112 | 
            +
                                    }
         | 
| 12113 | 
            +
                                }
         | 
| 12114 | 
            +
                                for (int j = i1; j < i2; ++j) {
         | 
| 12115 | 
            +
                                    int i = idx[2*j];
         | 
| 12116 | 
            +
                                    if (i < block_size/2) {
         | 
| 12117 | 
            +
                                        sumqx[0] += weight[i]*x_p[1]*xb[i];
         | 
| 12118 | 
            +
                                        sumqx[1] += weight[i]*x_p[1]*xb[i];
         | 
| 12119 | 
            +
                                        sumqx[2] += weight[i]*x_m[1]*xb[i];
         | 
| 12120 | 
            +
                                        sumqx[3] += weight[i]*x_m[1]*xb[i];
         | 
| 12121 | 
            +
                                        sumq2[0] += weight[i]*x_p[1]*x_p[1];
         | 
| 12122 | 
            +
                                        sumq2[1] += weight[i]*x_p[1]*x_p[1];
         | 
| 12123 | 
            +
                                        sumq2[2] += weight[i]*x_m[1]*x_m[1];
         | 
| 12124 | 
            +
                                        sumq2[3] += weight[i]*x_m[1]*x_m[1];
         | 
| 12125 | 
            +
                                    } else {
         | 
| 12126 | 
            +
                                        sumqx[0] += weight[i]*x_p[1]*xb[i];
         | 
| 12127 | 
            +
                                        sumqx[2] += weight[i]*x_p[1]*xb[i];
         | 
| 12128 | 
            +
                                        sumqx[1] += weight[i]*x_m[1]*xb[i];
         | 
| 12129 | 
            +
                                        sumqx[3] += weight[i]*x_m[1]*xb[i];
         | 
| 12130 | 
            +
                                        sumq2[0] += weight[i]*x_p[1]*x_p[1];
         | 
| 12131 | 
            +
                                        sumq2[2] += weight[i]*x_p[1]*x_p[1];
         | 
| 12132 | 
            +
                                        sumq2[1] += weight[i]*x_m[1]*x_m[1];
         | 
| 12133 | 
            +
                                        sumq2[3] += weight[i]*x_m[1]*x_m[1];
         | 
| 12134 | 
            +
                                    }
         | 
| 12135 | 
            +
                                }
         | 
| 12136 | 
            +
                                for (int j = i2; j < block_size; ++j) {
         | 
| 12137 | 
            +
                                    int i = idx[2*j];
         | 
| 12138 | 
            +
                                    if (i < block_size/2) {
         | 
| 12139 | 
            +
                                        sumqx[0] += weight[i]*x_p[2]*xb[i];
         | 
| 12140 | 
            +
                                        sumqx[1] += weight[i]*x_p[2]*xb[i];
         | 
| 12141 | 
            +
                                        sumqx[2] += weight[i]*x_m[2]*xb[i];
         | 
| 12142 | 
            +
                                        sumqx[3] += weight[i]*x_m[2]*xb[i];
         | 
| 12143 | 
            +
                                        sumq2[0] += weight[i]*x_p[2]*x_p[2];
         | 
| 12144 | 
            +
                                        sumq2[1] += weight[i]*x_p[2]*x_p[2];
         | 
| 12145 | 
            +
                                        sumq2[2] += weight[i]*x_m[2]*x_m[2];
         | 
| 12146 | 
            +
                                        sumq2[3] += weight[i]*x_m[2]*x_m[2];
         | 
| 12147 | 
            +
                                    } else {
         | 
| 12148 | 
            +
                                        sumqx[0] += weight[i]*x_p[2]*xb[i];
         | 
| 12149 | 
            +
                                        sumqx[2] += weight[i]*x_p[2]*xb[i];
         | 
| 12150 | 
            +
                                        sumqx[1] += weight[i]*x_m[2]*xb[i];
         | 
| 12151 | 
            +
                                        sumqx[3] += weight[i]*x_m[2]*xb[i];
         | 
| 12152 | 
            +
                                        sumq2[0] += weight[i]*x_p[2]*x_p[2];
         | 
| 12153 | 
            +
                                        sumq2[2] += weight[i]*x_p[2]*x_p[2];
         | 
| 12154 | 
            +
                                        sumq2[1] += weight[i]*x_m[2]*x_m[2];
         | 
| 12155 | 
            +
                                        sumq2[3] += weight[i]*x_m[2]*x_m[2];
         | 
| 12156 | 
            +
                                    }
         | 
| 12157 | 
            +
                                }
         | 
| 12158 | 
            +
                                for (int k = 0; k < 4; ++k) {
         | 
| 12159 | 
            +
                                    if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
         | 
| 12160 | 
            +
                                        scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
         | 
| 12161 | 
            +
                                        besti1 = i1; besti2 = i2; best_k = k;
         | 
| 12162 | 
            +
                                    }
         | 
| 12163 | 
            +
                                }
         | 
| 12164 | 
            +
                            }
         | 
| 12165 | 
            +
                        }
         | 
| 12166 | 
            +
                        GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
         | 
| 12167 | 
            +
                        for (int j =      0; j < besti1; ++j) L[idx[2*j]] = 0;
         | 
| 12168 | 
            +
                        for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
         | 
| 12169 | 
            +
                        for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
         | 
| 12170 | 
            +
                        if (scale < 0) {
         | 
| 12171 | 
            +
                            for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
         | 
| 12172 | 
            +
                            scale = -scale;
         | 
| 12173 | 
            +
                            best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
         | 
| 12174 | 
            +
                        }
         | 
| 12175 | 
            +
                        bool all_on_grid = true;
         | 
| 12176 | 
            +
                        for (int k = 0; k < block_size/8; ++k) {
         | 
| 12177 | 
            +
                            if (k == 0) xx = best_k < 2 ? x_p : x_m;
         | 
| 12178 | 
            +
                            else xx = best_k%2 == 0 ? x_p : x_m;
         | 
| 12179 | 
            +
                            uint16_t u = 0;
         | 
| 12180 | 
            +
                            for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
         | 
| 12181 | 
            +
                            int grid_index = kmap_q2xs[u];
         | 
| 12182 | 
            +
                            if (grid_index < 0) {
         | 
| 12183 | 
            +
                                all_on_grid = false;
         | 
| 12184 | 
            +
                                const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
         | 
| 12185 | 
            +
                                grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
         | 
| 12186 | 
            +
                                GGML_ASSERT(grid_index >= 0);
         | 
| 12187 | 
            +
                            }
         | 
| 12188 | 
            +
                            index[k] = grid_index;
         | 
| 12189 | 
            +
                        }
         | 
| 12190 | 
            +
                        if (!all_on_grid) {
         | 
| 12191 | 
            +
                            float sumqx_f = 0, sumq2_f = 0;
         | 
| 12192 | 
            +
                            for (int k = 0; k < block_size/8; ++k) {
         | 
| 12193 | 
            +
                                if (k == 0) xx = best_k < 2 ? x_p : x_m;
         | 
| 12194 | 
            +
                                else xx = best_k%2 == 0 ? x_p : x_m;
         | 
| 12195 | 
            +
                                const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
         | 
| 12196 | 
            +
                                for (int j = 0; j < 8; ++j) {
         | 
| 12197 | 
            +
                                    float w = weight[8*k + j];
         | 
| 12198 | 
            +
                                    float q = xx[(pg[j] - 1)/2];
         | 
| 12199 | 
            +
                                    sumqx_f += w*q*xb[8*k+j];
         | 
| 12200 | 
            +
                                    sumq2_f += w*q*q;
         | 
| 12201 | 
            +
                                }
         | 
| 12202 | 
            +
                            }
         | 
| 12203 | 
            +
                            if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
         | 
| 12204 | 
            +
                        }
         | 
| 12205 | 
            +
                        y[ibl].qs[2*ib + 0] = index[0] & 255;
         | 
| 12206 | 
            +
                        y[ibl].qs[2*ib + 1] = index[1] & 255;
         | 
| 12207 | 
            +
                        y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
         | 
| 12208 | 
            +
                        GGML_ASSERT(scale >= 0);
         | 
| 12209 | 
            +
                        scales[ib] = scale;
         | 
| 12210 | 
            +
                        shifts[ib] = best_k;
         | 
| 12211 | 
            +
                        max_scale = MAX(max_scale, scale);
         | 
| 12212 | 
            +
                    }
         | 
| 12213 | 
            +
             | 
| 12214 | 
            +
                    if (!max_scale) {
         | 
| 12215 | 
            +
                        continue;
         | 
| 12216 | 
            +
                    }
         | 
| 12217 | 
            +
             | 
| 12218 | 
            +
                    uint16_t * sc = (uint16_t *)y[ibl].scales;
         | 
| 12219 | 
            +
            #if QK_K == 64
         | 
| 12220 | 
            +
                    float d = max_scale/31;
         | 
| 12221 | 
            +
            #else
         | 
| 12222 | 
            +
                    float d = max_scale/15;
         | 
| 12223 | 
            +
            #endif
         | 
| 12224 | 
            +
                    float id = 1/d;
         | 
| 12225 | 
            +
                    float sumqx_f = 0, sumq2_f = 0;
         | 
| 12226 | 
            +
                    for (int ib = 0; ib < QK_K/block_size; ++ib) {
         | 
| 12227 | 
            +
                        int l = nearest_int(0.5f*(id*scales[ib+0]-1));
         | 
| 12228 | 
            +
            #if QK_K == 64
         | 
| 12229 | 
            +
                        l = MAX(0, MIN(15, l));
         | 
| 12230 | 
            +
                        sc[ib/4] |= (l << 4*(ib%4));
         | 
| 12231 | 
            +
            #else
         | 
| 12232 | 
            +
                        l = MAX(0, MIN(7, l));
         | 
| 12233 | 
            +
                        sc[ib/4] |= (l << 3*(ib%4));
         | 
| 12234 | 
            +
            #endif
         | 
| 12235 | 
            +
                        y[ibl].qh[ib] |= masks[shifts[ib]];
         | 
| 12236 | 
            +
                        const float * xb = xbl + block_size*ib;
         | 
| 12237 | 
            +
                        if (quant_weights) {
         | 
| 12238 | 
            +
                            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
         | 
| 12239 | 
            +
                            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
         | 
| 12240 | 
            +
                        } else {
         | 
| 12241 | 
            +
                            for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
         | 
| 12242 | 
            +
                        }
         | 
| 12243 | 
            +
                        for (int k = 0; k < block_size/8; ++k) {
         | 
| 12244 | 
            +
                            if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
         | 
| 12245 | 
            +
                            else xx = shifts[ib]%2 == 0 ? x_p : x_m;
         | 
| 12246 | 
            +
                            const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
         | 
| 12247 | 
            +
                            for (int j = 0; j < 8; ++j) {
         | 
| 12248 | 
            +
                                float w = weight[8*k + j];
         | 
| 12249 | 
            +
                                float q = xx[(pg[j] - 1)/2]*(2*l+1);
         | 
| 12250 | 
            +
                                sumqx_f += w*q*xb[8*k+j];
         | 
| 12251 | 
            +
                                sumq2_f += w*q*q;
         | 
| 12252 | 
            +
                            }
         | 
| 12253 | 
            +
                        }
         | 
| 12254 | 
            +
                    }
         | 
| 12255 | 
            +
                    if (sumq2_f > 0) d = sumqx_f/sumq2_f;
         | 
| 12256 | 
            +
                    s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
         | 
| 12257 | 
            +
            #if QK_K == 64
         | 
| 12258 | 
            +
                    y[ibl].d = s.f16;
         | 
| 12259 | 
            +
            #else
         | 
| 12260 | 
            +
                    sc[0] |= ((s.u16 & 0x000f) << 12);
         | 
| 12261 | 
            +
                    sc[1] |= ((s.u16 & 0x00f0) <<  8);
         | 
| 12262 | 
            +
                    sc[2] |= ((s.u16 & 0x0f00) <<  4);
         | 
| 12263 | 
            +
                    sc[3] |= ((s.u16 & 0xf000) <<  0);
         | 
| 12264 | 
            +
            #endif
         | 
| 12265 | 
            +
                }
         | 
| 12266 | 
            +
            }
         | 
| 12267 | 
            +
             | 
| 12268 | 
            +
            size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
         | 
| 12269 | 
            +
                GGML_ASSERT(n_per_row%QK_K == 0);
         | 
| 12270 | 
            +
                float  scales[QK_K/IQ1M_BLOCK_SIZE];
         | 
| 12271 | 
            +
                float  weight[IQ1M_BLOCK_SIZE];
         | 
| 12272 | 
            +
                int8_t L[IQ1M_BLOCK_SIZE];
         | 
| 12273 | 
            +
                float  pairs[2*IQ1M_BLOCK_SIZE];
         | 
| 12274 | 
            +
                uint16_t index[IQ1M_BLOCK_SIZE/8];
         | 
| 12275 | 
            +
                int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
         | 
| 12276 | 
            +
                int nblock = n_per_row/QK_K;
         | 
| 12277 | 
            +
                char * qrow = (char *)dst;
         | 
| 12278 | 
            +
                for (int row = 0; row < nrow; ++row) {
         | 
| 12279 | 
            +
                    quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
         | 
| 12280 | 
            +
                    src += n_per_row;
         | 
| 12281 | 
            +
                    qrow += nblock*sizeof(block_iq1_m);
         | 
| 12282 | 
            +
                }
         | 
| 12283 | 
            +
                return nrow * nblock * sizeof(block_iq1_m);
         | 
| 12284 | 
            +
            }
         | 
| 12285 | 
            +
             | 
| 11691 12286 | 
             
            // ============================ 4-bit non-linear quants
         | 
| 11692 12287 |  | 
| 11693 12288 | 
             
            static inline int best_index_int8(int n, const int8_t * val, float x) {
         |