npm - @novastera-oss/llamarn - Versions diffs - 0.2.7 → 0.3.0 - Mend

@novastera-oss/llamarn 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (319) hide show

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp DELETED Viewed

@@ -1,52 +0,0 @@
-#version 450 // Element-wise multiply compute shader: out_ = inA * inB (see main())
-#include "common.comp"
-layout(local_size_x = 1024) in; // 1024 invocations per workgroup along x
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; // left operand, read-only
-layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; // right operand, read-only; main() indexes it modulo ne10..ne13
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; // product, write-only
-layout(push_constant) uniform PushConstants {
-    uint inAOff; // added to every inA index in main()
-    uint inBOff; // added to every inB index in main()
-    uint outOff; // added to every out_ index in main()
-    int ne00; // not read by main()
-    int nb00; // not read by main()
-    int nb01; // inA strides for i01/i02/i03; main() divides the summed offset by 4 before indexing floats
-    int nb02;
-    int nb03;
-    int ne10; // inB extents; main() takes i0/i01/i02/i03 modulo these
-    int ne11;
-    int ne12;
-    int ne13;
-    int nb10; // not read by main()
-    int nb11; // inB strides for i11/i12/i13, with the same division by 4
-    int nb12;
-    int nb13;
-    int ne0; // number of elements main() processes per row
-    int nb0; // not read by main()
-    int nb1; // out_ strides for i01/i02/i03, with the same division by 4
-    int nb2;
-    int nb3;
-} pcs;
-void main() { // multiplies one row of inA by the matching row of inB; the row is selected by the workgroup ID
-    const uint i03 = gl_WorkGroupID.z;
-    const uint i02 = gl_WorkGroupID.y;
-    const uint i01 = gl_WorkGroupID.x;
-    const uint i13 = i03 % pcs.ne13; // inB coordinates wrap around inB's own extents, so a smaller inB is reused
-    const uint i12 = i02 % pcs.ne12;
-    const uint i11 = i01 % pcs.ne11;
-    uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4); // stride sum divided by 4 to index the float array
-    uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
-    uint dst_off  = uint((i03*pcs.nb3  + i02*pcs.nb2  + i01*pcs.nb1)  / 4);
-    for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { // each invocation starts at its local ID and advances by the workgroup size
-        const uint i10 = i0 % pcs.ne10; // wrap along the innermost inB dimension as well
-        out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
-    }
-}

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp DELETED Viewed

@@ -1,69 +0,0 @@
-#version 450 // Matrix multiply with float16_t inA and float inB; see main()
-#include "common.comp"
-#extension GL_KHR_shader_subgroup_arithmetic : require
-layout(local_size_x_id = 0) in; // workgroup x size is taken from specialization constant 0
-layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; // half-precision operand
-layout (binding = 1) readonly buffer tensorInB { float inB[]; }; // single-precision operand
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; // one float per computed dot product
-layout (push_constant) uniform parameter {
-    uint inAOff; // added to the inA index in main()
-    uint inBOff; // added to the inB index in main()
-    uint outOff; // added to the out_ index in main()
-    int ne00; // length of each dot product (loop bound in main())
-    int ne01; // not read by main()
-    int ne02; // not read by main()
-    uint nb00; // not read by main()
-    uint nb01; // inA strides; main() divides the summed offset by 2 before indexing float16_t
-    uint nb02;
-    uint nb03;
-    int ne10; // not read by main()
-    int ne11; // number of inB rows; main() stops once r1 reaches it
-    int ne12; // main() splits gl_WorkGroupID.z into i12 = im % ne12 and i13 = im / ne12
-    uint nb10; // not read by main()
-    uint nb11; // inB strides; main() divides the summed offset by 4 before indexing floats
-    uint nb12;
-    uint nb13;
-    int ne0; // used with ne1 to compute the out_ index
-    int ne1;
-    uint r2; // main() divides i12 by r2 when addressing inA
-    uint r3; // main() divides i13 by r3 when addressing inA
-} pcs;
-#define N_F16_F32 4 // inB rows handled per workgroup (loop bound in main())
-void main() { // per workgroup: dot products of inA row r0 with up to N_F16_F32 consecutive inB rows
-    const uint r0 = gl_WorkGroupID.x;
-    const uint rb = gl_WorkGroupID.y*N_F16_F32; // first inB row of this workgroup
-    const uint im = gl_WorkGroupID.z;
-    const uint i12 = im%pcs.ne12;
-    const uint i13 = im/pcs.ne12;
-    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03; // i12 and i13 are divided by r2/r3 on the inA side only
-    const uint x = offset0 / 2 + pcs.inAOff; // Based from inA; the /2 turns the stride sum into a float16_t index
-    for (uint row = 0; row < N_F16_F32; ++row) {
-        uint r1 = rb + row;
-        if (r1 >= pcs.ne11) { // past the last inB row: nothing left for this workgroup
-            break;
-        }
-        const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; // the /4 turns the stride sum into a float index
-        float sumf = 0;
-        for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { // each invocation accumulates every gl_SubgroupSize-th product
-            sumf += float(inA[x+i]) * float(inB[y+i]);
-        }
-        const float all_sum = subgroupAdd(sumf); // combine the per-invocation partial sums
-        if (subgroupElect()) { // a single elected invocation stores the result
-            out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
-        }
-    }
-}

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp DELETED Viewed

@@ -1,51 +0,0 @@
-#version 450 // float32 x float32 matrix multiply; see main()
-#include "common.comp"
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_EXT_debug_printf : enable
-// device subgroup size (supplied through specialization constant 0)
-layout (local_size_x_id = 0) in;
-layout(binding = 0) readonly buffer tensorInA { float inA[]; };
-layout(binding = 1) readonly buffer tensorInB { float inB[]; };
-layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
-layout(push_constant) uniform parameter {
-  uint inAOff; // added to the inA index in main()
-  uint inBOff; // added to the inB index in main()
-  uint outOff; // added to the out_ index in main()
-  int ne00; // length of each dot product (loop bound in main())
-  int ne01; // not read by main()
-  int ne02; // compared with ne12 in main() to derive the inA / inB batch indices
-  int ne11; // not read by main()
-  int ne12;
-  uint nb01; // inA strides; main() divides the summed offset by 4
-  uint nb02;
-  uint nb11; // inB strides; main() divides the summed offset by 4
-  uint nb12;
-  uint nb1; // out_ strides; main() divides each by 4
-  uint nb2;
-}
-pcs;
-void main() { // each workgroup computes one output value: the dot product of inA row gid.x with inB row gid.y for batch gid.z
-  uvec3 gid = gl_WorkGroupID;
-  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; // batch index used for inA: gid.z scaled down when inB has more batches
-  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; // batch index used for inB: gid.z scaled down when inA has more batches
-  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
-  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
-  float sum = 0.0f;
-  for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { // each invocation accumulates every gl_SubgroupSize-th product
-      sum += float(inA[x+i]) * float(inB[y+i]);
-  }
-  const float all_sum = subgroupAdd(sum); // combine the per-invocation partial sums
-  if (subgroupElect()) { // a single elected invocation stores the result
-    out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
-  }
-}

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp DELETED Viewed

@@ -1,33 +0,0 @@
-#version 450 // q4_0 variant: supplies block_q_n_dot_y for the main() defined in op_mul_mv_q_n.comp (included at the end)
-#include "common.comp"
-#define BLOCKS_IN_QUANT QK4_0 // the shared main() computes ne00 / BLOCKS_IN_QUANT as the per-row block count
-#define SIZE_OF_BLOCK sizeof_block_q4_0 // multiplied with the block index below to get a position in the byte buffer inA
-#define N_ROWS 4 // rows accumulated per subgroup by the shared main()
-#include "op_mul_mv_q_n_pre.comp"
-// q4_0 version: partial dot product of one block of inA with the inB values starting at yb.
-float block_q_n_dot_y(uint block_index, uint yb, uint il) { // il is added both to the quant byte offset here and to yb by the caller
-    vec2 acc = vec2(0.0, 0.0);
-    const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; // position of the block inside inA
-    float d = float(u8BufToFloat16(inA, index)); // block scale, fetched at the start of the block
-    float sumy = 0.0f;
-    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { // one 16-bit word (four 4-bit quants) per iteration
-        const uint16_t b = u8BufToU16(inA, index + 2 + il + i); // quant bytes begin 2 bytes into the block
-        const float yl0 = inB[yb + i];
-        const float yl1 = inB[yb + i + 1];
-        const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; // the high nibbles pair with inB values half a block further on
-        const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
-        sumy += yl0 + yl1 + yl8 + yl9;
-        acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); // nibbles are masked in place; /256 compensates for bit position 8
-        acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); // /16 and /4096 compensate for bit positions 4 and 12
-    }
-    return d * (sumy * -8.f + acc[0] + acc[1]); // equivalent to subtracting 8 from every quant before scaling by d
-}
-#include "op_mul_mv_q_n.comp"

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp DELETED Viewed

@@ -1,35 +0,0 @@
-#version 450 // q4_1 variant: supplies block_q_n_dot_y for the main() defined in op_mul_mv_q_n.comp (included at the end)
-#include "common.comp"
-#define BLOCKS_IN_QUANT QK4_1 // the shared main() computes ne00 / BLOCKS_IN_QUANT as the per-row block count
-#define SIZE_OF_BLOCK sizeof_block_q4_1 // multiplied with the block index below to get a position in the byte buffer inA
-#define N_ROWS 4 // rows accumulated per subgroup by the shared main()
-#include "op_mul_mv_q_n_pre.comp"
-// q4_1 version: partial dot product of one block of inA with the inB values starting at yb.
-float block_q_n_dot_y(uint block_index, uint yb, uint il) { // il is added both to the quant byte offset here and to yb by the caller
-    vec2 acc = vec2(0.0, 0.0);
-    const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; // position of the block inside inA
-    float d = float(u8BufToFloat16(inA, index)); // first 16-bit value of the block: multiplies the quant sum
-    float m = float(u8BufToFloat16(inA, index+2)); // second 16-bit value of the block: multiplies the sum of inB values
-    float sumy = 0.0f;
-    for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { // one 16-bit word (four 4-bit quants) per iteration
-        const uint16_t b = u8BufToU16(inA, index + 4 + il + i); // quant bytes begin 4 bytes into the block, after d and m
-        const float yl0 = inB[yb + i];
-        const float yl1 = inB[yb + i + 1];
-        const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; // the high nibbles pair with inB values half a block further on
-        const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
-        sumy += yl0 + yl1 + yl8 + yl9;
-        acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); // nibbles are masked in place; /256 compensates for bit position 8
-        acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); // /16 and /4096 compensate for bit positions 4 and 12
-    }
-    return d * (acc[0] + acc[1]) + sumy * m; // scaled quant sum plus m times the sum of the inB values
-}
-#include "op_mul_mv_q_n.comp"

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp DELETED Viewed

@@ -1,140 +0,0 @@
-#version 450 // q4_k matrix-vector multiply; see main()
-#include "common.comp"
-#define N_DST 4 // rows of inA handled per workgroup (first_row = r0 * N_DST in main())
-#define SIZE_OF_BLOCK sizeof_block_q4_k // divides the nb01..nb03 strides in main() to obtain block indices
-layout(local_size_x = 4) in; // fixed 4 x 8 x 1 workgroup
-layout(local_size_y = 8) in;
-layout(local_size_z = 1) in;
-layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; }; // block_q4_k is not declared in this file; main() reads its scales, qs, d and dmin members
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-layout (push_constant) uniform parameter {
-    uint inAOff; // added to the inA block index in main()
-    uint inBOff; // added to the inB index in main()
-    uint outOff; // added to the out_ index in main()
-    int ne00; // main() computes ne00 / QK_K as the number of blocks per row
-    int ne10; // not read by main()
-    int ne0; // used with ne1 to compute the out_ index
-    int ne1;
-    int ne01; // not read by main()
-    int ne02; // not read by main()
-    int ne12; // main() splits gl_WorkGroupID.z into i12 = im % ne12 and i13 = im / ne12
-    uint nb01; // inA strides; main() divides each by SIZE_OF_BLOCK
-    uint nb02;
-    uint nb03;
-    uint nb11; // inB strides; main() divides the summed offset by 4
-    uint nb12;
-    uint nb13;
-    uint r2; // main() divides i12 by r2 when addressing inA
-    uint r3; // main() divides i13 by r3 when addressing inA
-} pcs;
-void main() { // per workgroup: dot products of N_DST consecutive inA rows (q4_k blocks) with one inB row
-    const uint16_t kmask1 = uint16_t(0x3f3f); // low 6 bits of each byte
-    const uint16_t kmask2 = uint16_t(0x0f0f); // low nibble of each byte
-    const uint16_t kmask3 = uint16_t(0xc0c0); // top 2 bits of each byte
-    const uint ix = gl_SubgroupInvocationID/8;  // 0...3
-    const uint it = gl_SubgroupInvocationID%8;  // 0...7
-    const uint iq = it/4;     // 0 or 1
-    const uint ir = it%4;     // 0...3
-    const uint nb = pcs.ne00/QK_K; // blocks per row
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
-    const uint im = gl_WorkGroupID.z;
-    const uint first_row = r0 * N_DST;
-    const uint ib_row = first_row * nb; // not used below
-    const uint i12 = im%pcs.ne12;
-    const uint i13 = im/pcs.ne12;
-    const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); // measured in blocks
-    const uint offset1 =        r1*pcs.nb11 + (i12       )*pcs.nb12 + (i13       )*pcs.nb13;
-    const uint xblk = offset0 + pcs.inAOff;
-    const uint y = (offset1 / 4) + pcs.inBOff; // the /4 turns the stride sum into a float index
-    float yl[16];
-    float yh[16];
-    float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
-    float all_sum = 0.f;
-    uint y4 = y + ix * QK_K + 64 * iq + 8 * ir; // first inB element handled by this invocation
-    for (uint ib = ix; ib < nb; ib += 4) { // invocations with the same ix take every 4th block
-        const uint blk_idx = ib + xblk;
-        float sumy[4] = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) { // cache four runs of 8 inB values (offsets 0, 32, 128, 160) and their sums
-            yl[i+0] = inB[y4+i+  0]; sumy[0] += yl[i+0];
-            yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
-            yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
-            yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
-        }
-        for (int row = 0; row < N_DST; row++) {
-            uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK); // block distance between consecutive rows
-            uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
-            uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2); // NOTE(review): sc_1 is never used below
-            uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
-            uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6); // NOTE(review): sc_3 is never used below
-            uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
-            uint16_t sc16[4];
-            sc16[0] = sc_0 & kmask1; // low 6 bits of both bytes of sc_0
-            sc16[1] = sc_2 & kmask1; // low 6 bits of both bytes of sc_2
-            sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2); // low nibbles of sc_4 combined with the top 2 bits of sc_0 moved to bits 4-5
-            sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2); // high nibbles of sc_4 combined with the top 2 bits of sc_2 moved to bits 4-5
-            float acc1[4] = {0.f, 0.f, 0.f, 0.f};
-            float acc2[4] = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) { // nibbles are masked in place; the 1/256 and 1/16 factors in the sum below compensate
-                uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
-                uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
-                acc1[0] += yl[i+0] * (q1 & 0x000F);
-                acc1[1] += yl[i+1] * (q1 & 0x0F00);
-                acc1[2] += yl[i+8] * (q1 & 0x00F0);
-                acc1[3] += yl[i+9] * (q1 & 0xF000);
-                acc2[0] += yh[i+0] * (q2 & 0x000F);
-                acc2[1] += yh[i+1] * (q2 & 0x0F00);
-                acc2[2] += yh[i+8] * (q2 & 0x00F0);
-                acc2[3] += yh[i+9] * (q2 & 0xF000);
-            }
-            uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF); // split each 16-bit pair into its two bytes
-            uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
-            uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
-            uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
-            uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
-            uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
-            uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
-            uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
-            float dall = float(inA[blk_idx + row_idx].d);
-            float dmin = float(inA[blk_idx + row_idx].dmin);
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 + // sc8_0/1/4/5 weight the quant sums under dall
-                               (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
-                               (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
-                               (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
-                dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7); // sc8_2/3/6/7 weight the inB sums under dmin
-        }
-        y4 += 4 * QK_K; // advance past the 4 blocks covered by the ix = 0..3 invocations
-    }
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = subgroupAdd(sumf[row]); // combine the per-invocation partial sums for this row
-        if (subgroupElect()) { // NOTE(review): no first_row + row < ne01 guard here, unlike op_mul_mv_q_n.comp and op_mul_mat_q8_0.comp - confirm
-            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
-        }
-    }
-}

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp DELETED Viewed

@@ -1,106 +0,0 @@
-#version 450 // q6_k matrix-vector multiply; see main()
-#include "common.comp"
-#define SIZE_OF_BLOCK sizeof_block_q6_k // block size used for both stride division and byte addressing in main()
-layout(local_size_x_id = 0) in; // workgroup x size from specialization constant 0
-layout(local_size_y_id = 1) in; // workgroup y size from specialization constant 1
-layout(local_size_z = 1) in;
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; // quantized data addressed as raw bytes
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-layout (push_constant) uniform parameter {
-    uint inAOff; // added to the inA byte index in main()
-    uint inBOff; // added to the inB index in main()
-    uint outOff; // added to the out_ index in main()
-    int ne00; // main() computes ne00 / QK_K as the number of blocks per row
-    int ne10; // not read by main()
-    int ne0; // used with ne1 to compute the out_ index
-    int ne1;
-    int ne01; // not read by main()
-    int ne02; // not read by main()
-    int ne12; // main() splits gl_WorkGroupID.z into i12 = im % ne12 and i13 = im / ne12
-    uint nb01; // inA strides; main() divides each by SIZE_OF_BLOCK
-    uint nb02;
-    uint nb03;
-    uint nb11; // inB strides; main() divides the summed offset by 4
-    uint nb12;
-    uint nb13;
-    uint r2; // main() divides i12 by r2 when addressing inA
-    uint r3; // main() divides i13 by r3 when addressing inA
-} pcs;
-void main() { // each subgroup computes the dot product of one inA row (q6_k blocks) with one inB row
-    const uint8_t kmask1 = uint8_t(0x03); // the four 2-bit fields of a byte
-    const uint8_t kmask2 = uint8_t(0x0C);
-    const uint8_t kmask3 = uint8_t(0x30);
-    const uint8_t kmask4 = uint8_t(0xC0);
-    const uint nb = pcs.ne00/QK_K; // blocks per row
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
-    const uint im = gl_WorkGroupID.z;
-    const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID); // one inA row per subgroup
-    const uint i12 = im%pcs.ne12;
-    const uint i13 = im/pcs.ne12;
-    const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); // measured in blocks
-    const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; // the /4 turns the stride sum into a float index
-    float sumf = 0;
-    // bits of invocation ID for gl_SubgroupSize=32:
-    //  x   x   x   x   x
-    //  4   3   2   1   0
-    // (     tid     ) ix
-    //  ip (   il    )
-    const uint block_stride = gl_SubgroupSize / 16;         // number of blocks each subgroup processes
-    const uint tid  = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
-    const uint ix   = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
-    const uint ip   = tid/8;        // first or second half of block (0 or 1)
-    const uint il   = tid%8;        // each half has 8 parts, one per scale
-    const uint n    = 4;            // 4 scales at a time (and 4 sums)
-    const uint l0   = n*il;         // offset into half-block, 0..28
-    const uint is   = 8*ip + l0/16; // 0, 1, 8, 9
-    const uint y_offset = 128*ip + l0;
-    const uint q_offset_l = 64*ip + l0;
-    const uint q_offset_h = 32*ip + l0;
-    for (uint i = ix; i < nb; i += block_stride) {
-        const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; // byte position of block i within inA
-        const uint qlIndex = q_offset_l;
-        const uint q2Index = qlIndex + QK_K/8;
-        const uint qhIndex = q_offset_h;
-        const uint y = yy + i * QK_K + y_offset;
-        float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        for (uint l = 0; l < n; ++l) {
-            const uint8_t currentQ1 = inA[baseIndex + qlIndex + l]; // low-bit bytes are read from the start of the block
-            const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
-            const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l]; // high-bit bytes are read from QK_K/2 onwards
-            sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32); // 4 low bits plus 2 high bits form a 6-bit value, then 32 is subtracted
-            sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
-            sums[2] += inB[y+l+64] * (int8_t((currentQ1  >> 4) | ((currentQh & kmask3) << 0)) - 32);
-            sums[3] += inB[y+l+96] * (int8_t((currentQ2  >> 4) | ((currentQh & kmask4) >> 2)) - 32);
-        }
-        float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16); // block-wide factor read after the QK_K/16 scale bytes
-        sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is])); // each of the four sums is weighted by its own signed 8-bit scale
-    }
-    const float tot = subgroupAdd(sumf); // combine the per-invocation partial sums
-    if (subgroupElect()) { // NOTE(review): no row < ne01 guard here, unlike op_mul_mv_q_n.comp and op_mul_mat_q8_0.comp - confirm
-        out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
-    }
-}

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp DELETED Viewed

@@ -1,73 +0,0 @@
-#version 450 // q8_0 matrix-vector multiply; buffers and push constants come from op_mul_mv_q_n_pre.comp
-#include "common.comp"
-#include "op_mul_mv_q_n_pre.comp"
-#define SIZE_OF_D 2 // bytes skipped at the start of each block before the quants; main() reads the scale there
-#define N_DST 4 // each SIMD group works on 4 rows
-#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
-#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
-#define NB_Q8_0 8 // quants each invocation handles per block in main()
-void main() { // each subgroup computes dot products of N_DST inA rows (q8_0 blocks) with one inB row
-    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
-    if (gl_SubgroupInvocationID > 31)
-        return;
-    const int nr  = N_DST;
-    const int nsg = N_SIMDGROUP; // NOTE(review): hard-coded subgroup count, where op_mul_mv_q_n.comp uses gl_NumSubgroups - confirm against the dispatch
-    const int nw  = N_SIMDWIDTH;
-    const int nb = pcs.ne00/QK8_0; // blocks per row
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
-    const uint im = gl_WorkGroupID.z;
-    const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
-    const uint i12 = im%pcs.ne12;
-    const uint i13 = im/pcs.ne12;
-    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02); // block offset built from element counts; the nb01..nb03 strides are not used here
-    const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
-    const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB; likewise built from element counts rather than nb11..nb13
-    float yl[NB_Q8_0];
-    float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
-    const uint ix = gl_SubgroupInvocationID.x/4; // four invocations share one block
-    const uint il = gl_SubgroupInvocationID.x%4; // which run of NB_Q8_0 quants inside that block
-    uint yb = y + ix * QK8_0 + NB_Q8_0*il;
-    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
-    for (uint ib = ix; ib < nb; ib += nw/4) {
-        for (int i = 0; i < NB_Q8_0; ++i) { // cache this invocation's inB values once for all rows
-            yl[i] = inB[yb + i];
-        }
-        for (int row = 0; row < nr; row++) {
-            const uint block_offset = (ib+row*nb) * sizeof_block_q8_0; // consecutive rows are nb blocks apart
-            float sumq = 0.f;
-            for (int iq = 0; iq < NB_Q8_0; ++iq) {
-                const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]); // signed 8-bit quant located after the scale bytes
-                sumq += qs_iq * yl[iq];
-            }
-            const float16_t d = u8BufToFloat16(inA, x + block_offset); // block scale, fetched at the start of the block
-            sumf[row] += sumq*d;
-        }
-        yb += NB_Q8_0 * nw;
-    }
-    for (int row = 0; row < nr; ++row) {
-        const float tot = subgroupAdd(sumf[row]); // combine the per-invocation partial sums for this row
-        if (subgroupElect() && first_row + row < pcs.ne01) { // skip rows beyond ne01
-            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot; // NOTE(review): pcs.outOff is not added here, unlike the equivalent write in the other mul_mat shaders - confirm
-        }
-    }
-}

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp DELETED Viewed

@@ -1,52 +0,0 @@
-void main() { // shared entry point for shaders that define BLOCKS_IN_QUANT, SIZE_OF_BLOCK, N_ROWS and block_q_n_dot_y before including this file
-    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
-    if (gl_SubgroupInvocationID > 31)
-        return;
-    const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT); // blocks per row
-    const uint r0 = gl_WorkGroupID.x;
-    const uint r1 = gl_WorkGroupID.y;
-    const uint im = gl_WorkGroupID.z;
-    const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS; // each subgroup owns N_ROWS consecutive rows
-    const uint i12 = im%pcs.ne12;
-    const uint i13 = im/pcs.ne12;
-    // pointers to src0 rows
-    uint ax[N_ROWS];
-    for (int row = 0; row < N_ROWS; ++row) {
-        const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); // measured in blocks
-        ax[row] = offset0 + pcs.inAOff;
-    }
-    const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; // the /4 turns the stride sum into a float index
-    float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f}; // four initializers, matching N_ROWS 4 in the q4_0 and q4_1 shaders
-    const uint ix = gl_SubgroupInvocationID/2; // two invocations share one block
-    const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2); // 0 for the first invocation of the pair, BLOCKS_IN_QUANT/4 for the second
-    uint yb = y + ix * BLOCKS_IN_QUANT + il;
-    //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
-    //    gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
-    //    gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
-    for (uint ib = ix; ib < nb; ib += 16) { // 32 active invocations, 2 per block, so 16 blocks per pass
-        for (int row = 0; row < N_ROWS; row++) {
-            sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
-        }
-        yb += BLOCKS_IN_QUANT * 16;
-    }
-    for (int row = 0; row < N_ROWS; ++row) {
-        const float tot = subgroupAdd(sumf[row]); // combine the per-invocation partial sums for this row
-        if (first_row + row < pcs.ne01 && subgroupElect()) { // skip rows beyond ne01; one elected invocation stores the result
-            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
-        }
-    }
-}

package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp DELETED Viewed

@@ -1,28 +0,0 @@
-layout(local_size_x_id = 0) in; // workgroup x size from specialization constant 0; shared declarations included by the q4_0, q4_1 and q8_0 shaders
-layout(local_size_y = 8) in;
-layout(local_size_z = 1) in;
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; // quantized data addressed as raw bytes
-layout (binding = 1) readonly buffer tensorInB { float inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-layout (push_constant) uniform parameter {
-    uint inAOff; // added to inA addressing by the including shaders
-    uint inBOff; // added to inB indices by the including shaders
-    uint outOff; // added to the out_ index in op_mul_mv_q_n.comp
-    int  ne00; // divided by the block element count to get blocks per row
-    int  ne01; // upper bound for the row index when writing results
-    int  ne02;
-    int  ne10;
-    int  ne12; // gl_WorkGroupID.z is split into i12 = im % ne12 and i13 = im / ne12
-    int  ne0; // used with ne1 to compute the out_ index
-    int  ne1;
-    uint nb01; // inA strides, divided by SIZE_OF_BLOCK in op_mul_mv_q_n.comp
-    uint nb02;
-    uint nb03;
-    uint nb11; // inB strides, summed and divided by 4 in op_mul_mv_q_n.comp
-    uint nb12;
-    uint nb13;
-    uint r2; // divisor applied to i12 when addressing inA
-    uint r3; // divisor applied to i13 when addressing inA
-} pcs;