RubyGems - llama_cpp - Versions diffs - 0.15.2 → 0.15.3 - Mend

llama_cpp 0.15.2 → 0.15.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +8 -0
data/ext/llama_cpp/llama_cpp.cpp +49 -0
data/lib/llama_cpp/version.rb +2 -2
data/sig/llama_cpp.rbs +4 -0
data/vendor/tmp/llama.cpp/Makefile +6 -17
data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
data/vendor/tmp/llama.cpp/ggml.c +376 -758
data/vendor/tmp/llama.cpp/ggml.h +39 -27
data/vendor/tmp/llama.cpp/llama.cpp +823 -593
data/vendor/tmp/llama.cpp/llama.h +10 -3
metadata +3 -3

data/vendor/tmp/llama.cpp/ggml-metal.metal CHANGED Viewed

@@ -1640,6 +1640,7 @@ static void rope_yarn_corr_dims(
 typedef void (rope_t)(
         device const    void * src0,
         device const int32_t * src1,
+        device const   float * src2,
         device         float * dst,
         constant     int64_t & ne00,
         constant     int64_t & ne01,
@@ -1675,6 +1676,7 @@ template<typename T>
 kernel void kernel_rope(
         device const    void * src0,
         device const int32_t * src1,
+        device const   float * src2,
         device         float * dst,
         constant     int64_t & ne00,
         constant     int64_t & ne01,
@@ -1744,8 +1746,10 @@ kernel void kernel_rope(
                 // simplified from `(ib * n_dims + ic) * inv_ndims`
                 const float cur_rot = inv_ndims*ic - ib;
+                const float freq_factor = src2 != src0 ? src2[ic/2] : 1.0f;
+                const float theta = theta_0 * pow(freq_base, cur_rot) / freq_factor;
-                const float theta = theta_0 * pow(freq_base, cur_rot);
                 float cos_theta, sin_theta;
                 rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
@@ -2204,11 +2208,7 @@ kernel void kernel_flash_attn_ext_f16(
         // pointer to the mask
         device const half * mp = (device const half *) (mask + iq1*nb31);
-        // prepare diagonal scale matrix
-        simdgroup_float8x8 mscale(scale);
-        // prepare diagonal slope matrix
-        simdgroup_float8x8 mslope(1.0f);
+        float slope = 1.0f;
         // ALiBi
         if (max_bias > 0.0f) {
@@ -2217,7 +2217,7 @@ kernel void kernel_flash_attn_ext_f16(
             const float base = h < n_head_log2 ? m0 : m1;
             const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-            mslope = simdgroup_float8x8(pow(base, exph));
+            slope = pow(base, exph);
         }
         // loop over the KV cache
@@ -2242,18 +2242,20 @@ kernel void kernel_flash_attn_ext_f16(
                         simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
                     }
+                    simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
+                    const short tx = tiisg%4;
+                    const short ty = tiisg/4;
                     if (mask != q) {
                         // mqk = mqk*scale + mask*slope
-                        simdgroup_half8x8 mm;
-                        simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
-                        simdgroup_multiply(mm, mslope, mm);
-                        simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
+                        ss[8*cc + ty*TF + 2*tx + 0] = scale*ss[8*cc + ty*TF + 2*tx + 0] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0];
+                        ss[8*cc + ty*TF + 2*tx + 1] = scale*ss[8*cc + ty*TF + 2*tx + 1] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1];
                     } else {
                         // mqk = mqk*scale
-                        simdgroup_multiply(mqk, mscale, mqk);
+                        ss[8*cc + ty*TF + 2*tx + 0] *= scale;
+                        ss[8*cc + ty*TF + 2*tx + 1] *= scale;
                     }
-                    simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
                 }
             }
@@ -2816,8 +2818,7 @@ kernel void kernel_cpy_f32_f16(
     for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
         device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-        // TODO: is there a better way to handle -INFINITY?
-        dst_data[i00] = src[0] == -INFINITY ? -MAXHALF : src[0];
+        dst_data[i00] = src[0];
     }
 }
@@ -3385,7 +3386,6 @@ void kernel_mul_mv_q2_K_f32_impl(
     const int step = sizeof(block_q2_K) * nb;
-#if QK_K == 256
     const int ix = tiisg/8;  // 0...3
     const int it = tiisg%8;  // 0...7
     const int iq = it/4;     // 0 or 1
@@ -3437,57 +3437,6 @@ void kernel_mul_mv_q2_K_f32_impl(
         y4 += 4 * QK_K;
     }
-#else
-    const int ix = tiisg/2;  // 0...15
-    const int it = tiisg%2;  // 0...1
-    device const float * y4 = y + ix * QK_K + 8 * it;
-    for (int ib = ix; ib < nb; ib += 16) {
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
-            yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
-            yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
-            yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
-        }
-        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = &x[ib].d;
-        for (int row = 0; row < N_DST; row++) {
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
-                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
-                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
-                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
-                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
-                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
-                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
-                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
-            }
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
-                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
-                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
-                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
-                         dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
-            qs += step/2;
-            sc += step;
-            dh += step/2;
-        }
-        y4 += 16 * QK_K;
-    }
-#endif
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
@@ -3525,7 +3474,6 @@ kernel void kernel_mul_mv_q2_K_f32(
     kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
-#if QK_K == 256
 void kernel_mul_mv_q3_K_f32_impl(
         device const  void * src0,
         device const float * src1,
@@ -3684,84 +3632,6 @@ void kernel_mul_mv_q3_K_f32_impl(
         }
     }
 }
-#else
-void kernel_mul_mv_q3_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    const int nb = ne00/QK_K;
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-    const int row = 2 * r0 + sgitg;
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-    const int ix = tiisg/4;
-    const int il = 4 * (tiisg%4);// 0, 4, 8, 12
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-    float2 sum = {0.f, 0.f};
-    for (int i = ix; i < nb; i += 8) {
-        const float d_all = (float)(x[i].d);
-        device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
-        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
-        device const uint16_t * s = (device const uint16_t *)(x[i].scales);
-        device const float    * y = yy + i * QK_K + il;
-        const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
-        const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
-        const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
-        const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
-        for (int l = 0; l < 4; l += 2) {
-            const uint16_t hm = h[l/2] >> iq;
-            sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 :  4))
-                    + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
-                    + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
-                    + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
-            sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
-                    + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
-                    + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
-                    + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
-        }
-    }
-    const float sumf = sum[0] + sum[1] * 1.f/256.f;
-    const float tot = simd_sum(sumf);
-    if (tiisg == 0) {
-        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
-    }
-}
-#endif
 [[host_name("kernel_mul_mv_q3_K_f32")]]
 kernel void kernel_mul_mv_q3_K_f32(
@@ -3791,7 +3661,6 @@ kernel void kernel_mul_mv_q3_K_f32(
     kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
-#if QK_K == 256
 void kernel_mul_mv_q4_K_f32_impl(
         device const  void * src0,
         device const float * src1,
@@ -3905,103 +3774,6 @@ void kernel_mul_mv_q4_K_f32_impl(
         }
     }
 }
-#else
-void kernel_mul_mv_q4_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    const int ix = tiisg/4;  // 0...7
-    const int it = tiisg%4;  // 0...3
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    const int first_row = r0 * N_DST;
-    const int ib_row = first_row * nb;
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-    float yl[8];
-    float yh[8];
-    float sumf[N_DST]={0.f}, all_sum;
-    const int step = sizeof(block_q4_K) * nb / 2;
-    device const float * y4 = y + ix * QK_K + 8 * it;
-    uint16_t sc16[4];
-    for (int ib = ix; ib < nb; ib += 8) {
-        float2 sumy = {0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
-            yh[i] = y4[i+32]; sumy[1] += yh[i];
-        }
-        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = x[ib].d;
-        for (int row = 0; row < N_DST; row++) {
-            sc16[0] = sc[0] & 0x000f;
-            sc16[1] = sc[0] & 0x0f00;
-            sc16[2] = sc[0] & 0x00f0;
-            sc16[3] = sc[0] & 0xf000;
-            float2 acc1 = {0.f, 0.f};
-            float2 acc2 = {0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
-                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
-                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
-                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
-            }
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
-                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
-                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
-            qs += step;
-            sc += step;
-            dh += step;
-        }
-        y4 += 8 * QK_K;
-    }
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-#endif
 [[host_name("kernel_mul_mv_q4_K_f32")]]
 kernel void kernel_mul_mv_q4_K_f32(
@@ -4069,8 +3841,6 @@ void kernel_mul_mv_q5_K_f32_impl(
     const int step = sizeof(block_q5_K) * nb;
-#if QK_K == 256
-#
     float yl[16], yh[16];
     const uint16_t kmask1 = 0x3f3f;
@@ -4153,54 +3923,6 @@ void kernel_mul_mv_q5_K_f32_impl(
         y1 += 4 * QK_K;
     }
-#else
-    float yl[8], yh[8];
-    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
-    const int ix = tiisg%8;
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-    device const float * y = yy + ix*QK_K + il;
-    for (int i = ix; i < nb; i += 8) {
-        for (int l = 0; l < 4; ++l) {
-            yl[l+0] = y[l+ 0];
-            yl[l+4] = y[l+16];
-            yh[l+0] = y[l+32];
-            yh[l+4] = y[l+48];
-        }
-        device const half * dh = &x[i].d;
-        device const uint8_t * q = x[i].qs + il;
-        device const uint8_t * h = x[i].qh + in;
-        device const int8_t  * s = x[i].scales;
-        for (int row = 0; row < 2; ++row) {
-            const float d = dh[0];
-            float2 acc = {0.f, 0.f};
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t hl = h[l] >> iq;
-                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
-                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
-                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
-                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
-            }
-            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
-            q += step;
-            h += step;
-            s += step;
-            dh += step/2;
-        }
-        y += 8 * QK_K;
-    }
-#endif
     for (int row = 0; row < 2; ++row) {
         const float tot = simd_sum(sumf[row]);
@@ -4279,7 +4001,6 @@ void kernel_mul_mv_q6_K_f32_impl(
     float sumf = 0;
-#if QK_K == 256
     const int tid  = tiisg/2;
     const int ix   = tiisg%2;
     const int ip   = tid/8;         // 0 or 1
@@ -4315,30 +4036,6 @@ void kernel_mul_mv_q6_K_f32_impl(
     }
-#else
-    const int ix  = tiisg/4;
-    const int il  = 4*(tiisg%4);
-    for (int i = ix; i < nb; i += 8) {
-        device const float * y = yy + i * QK_K + il;
-        device const uint8_t * ql = x[i].ql + il;
-        device const uint8_t * qh = x[i].qh + il;
-        device const int8_t  * s  = x[i].scales;
-        const float d = x[i].d;
-        float4 sums = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < 4; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >>  4) | ((qh[l] & kmask3) >> 0)) - 32);
-            sums[3] += y[l+48] * ((int8_t)((ql[l+16] >>  4) | ((qh[l] & kmask4) >> 2)) - 32);
-        }
-        sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
-    }
-#endif
     const float tot = simd_sum(sumf);
     if (tiisg == 0) {
         dst[r1*ne0 + im*ne0*ne1 + row] = tot;
@@ -5172,9 +4869,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
     device const float * y4 = y + 32 * ix;
-#if QK_K != 64
     iq1m_scale_t scale;
-#endif
     for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
@@ -5195,10 +4890,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
         device const uint16_t * sc = (device const uint16_t *)xr->scales;
         for (int row = 0; row < N_DST; row++) {
-#if QK_K != 64
             scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-#endif
             constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
             constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
@@ -5214,14 +4906,9 @@ void kernel_mul_mv_iq1_m_f32_impl(
             }
             const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
             const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
-#if QK_K == 64
-            const float d = (float) *((device const half *)(sc - 1));
-            sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) +
-                              (sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1));
-#else
             sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
                                              (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
-#endif
             sc += nb*sizeof(block_iq1_m)/2;
             qs += nb*sizeof(block_iq1_m);
@@ -5333,7 +5020,6 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     }
 }
-#if QK_K != 64
 void kernel_mul_mv_iq4_xs_f32_impl(
         device const  void * src0,
         device const float * src1,
@@ -5428,7 +5114,6 @@ void kernel_mul_mv_iq4_xs_f32_impl(
         }
     }
 }
-#endif
 [[host_name("kernel_mul_mv_iq1_s_f32")]]
 kernel void kernel_mul_mv_iq1_s_f32(
@@ -5541,11 +5226,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
-#if QK_K == 64
-    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#else
     kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#endif
 }
 //============================= templates and their specializations =============================
@@ -5671,10 +5352,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
     float dl, ml;
     uint8_t sc = xb->scales[il];
-#if QK_K == 256
     q = q + 32*(il/8) + 16*(il&1);
     il = (il/2)%4;
-#endif
     half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
     uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
     dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
@@ -5690,7 +5370,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     device const uint8_t * h = (device const uint8_t *)xb->hmask;
     device const int8_t * scales = (device const int8_t *)xb->scales;
-#if QK_K == 256
     q = q + 32 * (il/8) + 16 * (il&1);
     h = h + 16 * (il&1);
     uint8_t m = 1 << (il/2);
@@ -5711,17 +5390,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
     }
-#else
-    float    kcoef = il&1 ? 1.f/16.f : 1.f;
-    uint16_t kmask = il&1 ? 0xF0     : 0x0F;
-    float    dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
-    float    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uint8_t  mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    uint8_t  m = 1<<(il*2);
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
-    }
-#endif
 }
 static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
@@ -5733,7 +5401,6 @@ template <typename type4x4>
 void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
     device const uchar * q = xb->qs;
-#if QK_K == 256
     short is = (il/4) * 2;
     q = q + (il/4) * 32 + 16 * (il&1);
     il = il & 3;
@@ -5742,16 +5409,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
     const float min = xb->dmin;
     const float dl = d * sc[0];
     const float ml = min * sc[1];
-#else
-    (void) get_scale_min_k4_just2;
-    q = q + 16 * (il&1);
-    device const uint8_t * s = xb->scales;
-    device const half2 * dh = (device const half2 *)xb->d;
-    const float2 d = (float2)dh[0];
-    const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
-    const float ml = il<2 ? d[1] * (s[0]>>4)  : d[1] * (s[1]>>4);
-#endif
     const ushort mask = il<2 ? 0x0F : 0xF0;
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - ml;
@@ -5763,7 +5421,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     device const uint8_t * q  = xb->qs;
     device const uint8_t * qh = xb->qh;
-#if QK_K == 256
     short is = (il/4) * 2;
     q  = q + 32 * (il/4) + 16 * (il&1);
     qh = qh + 16 * (il&1);
@@ -5780,17 +5437,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
     }
-#else
-    q = q + 16 * (il&1);
-    device const int8_t * s = xb->scales;
-    const float dl = xb->d * s[il];
-    uint8_t m = 1<<(il*2);
-    const float  coef = il<2 ? 1.f  : 1.f/16.f;
-    const ushort mask = il<2 ? 0x0F : 0xF0;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
-    }
-#endif
 }
 template <typename type4x4>
@@ -5800,15 +5446,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
     device const uint8_t * qh = (device const uint8_t *)xb->qh;
     device const int8_t * scales = (device const int8_t *)xb->scales;
-#if QK_K == 256
     ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
     qh = qh + 32*(il/8) + 16*(il&1);
     float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;
-#else
-    ql = ql + 16 * (il&1);
-    float sc = scales[il];
-#endif
     const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
     const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
     const float       coef = il>1 ? 1.f/16.f          : 1.f;
@@ -5965,20 +5607,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 &
     const int ib32 = il/2;
     il = il%2;
     device const uint16_t * sc = (device const uint16_t *)xb->scales;
-#if QK_K == 64
-    const float d = xb->d;
-#else
     iq1m_scale_t scale;
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = scale.f16;
-#endif
     device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
     device const uint8_t * qh = xb->qh + 2*ib32 + il;
-#if QK_K == 64
-    const float dl  = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1);
-#else
     const float dl  = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
-#endif
     const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
     const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
     constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
@@ -6008,9 +5645,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
 template <typename type4x4>
 void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
-#if QK_K == 64
-    dequantize_iq4_nl(xb, il, reg);
-#else
     // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
     const int ib32 = il/2;
     il = il%2;
@@ -6027,7 +5661,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
         reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
         reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
     }
-#endif
 }
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
@@ -6532,11 +6165,7 @@ kernel void kernel_mul_mm_id(
         sgitg);
 }
-#if QK_K == 256
 #define QK_NL 16
-#else
-#define QK_NL 4
-#endif
 //
 // get rows
@@ -6576,11 +6205,7 @@ template [[host_name("kernel_get_rows_iq2_s")]]   kernel get_rows_t kernel_get_r
 template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_t kernel_get_rows<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_get_rows_iq1_m")]]   kernel get_rows_t kernel_get_rows<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_get_rows_iq4_nl")]]  kernel get_rows_t kernel_get_rows<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif
 //
 // matrix-matrix multiplication
@@ -6608,11 +6233,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_m
 template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif
 //
 // indirect matrix-matrix multiplication
@@ -6640,11 +6261,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel
 template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_id_iq1_m_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif
 //
 // matrix-vector multiplication
@@ -6853,7 +6470,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq3_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
-#if QK_K != 64
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
-#endif