npm - @novastera-oss/llamarn - Versions diffs - 0.0.1-alpha.4 - Mend

@novastera-oss/llamarn 0.0.1-alpha.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (989) hide show

package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl ADDED Viewed

@@ -0,0 +1,190 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+#define QK4_0                   32
+#define QR4_0                   2
+#define QK4_1                   32
+#define QR4_1                   2
+#define QK5_0                   32
+#define QR5_0                   2
+#define QK5_1                   32
+#define QR5_1                   2
+#define QK8_0                   32
+#define QR8_0                   1
+#define QK_K                    256
+#define K_QUANTS_PER_ITERATION  2
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+//------------------------------------------------------------------------------
+// block_q6_K
+//------------------------------------------------------------------------------
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 6.5625 bits per weight
+typedef struct {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    half d;             // super-block scale
+} block_q6_K;
+//------------------------------------------------------------------------------
+// kernel_mul_mv_q6_K_f32
+//------------------------------------------------------------------------------
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+#ifdef INTEL_GPU
+#define N_DST 1 // number of rows each SIMD group works on
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // SIMD group size
+#elif defined (ADRENO_GPU)
+#define N_DST 1
+#define N_SIMDGROUP 2
+#define N_SIMDWIDTH 64
+#endif
+#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q6_K_f32(
+        global void * src0,
+        ulong offset0,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+    uchar kmask1 = 0x03;
+    uchar kmask2 = 0x0C;
+    uchar kmask3 = 0x30;
+    uchar kmask4 = 0xC0;
+    int nb = ne00/QK_K;
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+    int row = N_SIMDGROUP * r0 + get_sub_group_id();
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+    ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
+    global float      * yy = (global float     *) src1 + r1*ne10 + im*ne00*ne1;
+    float sumf = 0;
+    // For Q6_K quantization, 16 values forms a subblock, 16 subblock forms a
+    // block. Values in a subblock shares a scale that is quantized with 8 bits;
+    // the entire block shares a single floating point scale.
+    // For work distribution, each thread processes a subblock (16 weights), hence
+    // 16 threads process a (super) block -- a subgroup thus handles SIMDWIDTH/16
+    // (super) blocks -- this is the block stride.
+    // The 16 threads that process a (super) block are split into 2 portions, each has
+    // 8 threads; each portion works on 8 subblocks.
+    // For subgroup of 16 threads, the entire subgroup works on a single (super) block
+    // before moving to the next (super) block. Thread0 - thread7 work on the
+    // first 8 subblocks; thread8 - thread15 works on the last 8 subblocks.
+    // Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
+    // subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
+    // works on a total of 16 weight values.
+    int tid  = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
+    int ix   = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
+    int ip   = tid/8;   // first or second half of (super) block (0 or 1)
+    int il   = tid%8;   // each half has 8 parts, one per scale
+    int n    = 4;       // 4 scales at a time (and 4 sums)
+    int l0   = n*il;    // offset into half-block, 0..28
+    int is   = 8*ip + l0/16; // 0, 1, 8, 9
+    int y_offset = 128*ip + l0;
+    int q_offset_l = 64*ip + l0;
+    int q_offset_h = 32*ip + l0;
+    for (int i = ix; i < nb; i += BLOCK_STRIDE) {
+        global uint8_t * q1 = x[i].ql + q_offset_l;
+        global uint8_t * q2 = q1 + QK_K/8;
+        global uint8_t * qh = x[i].qh + q_offset_h;
+        global int8_t  * sc = x[i].scales + is;
+        global float * y = yy + i * QK_K + y_offset;
+        float dall = x[i].d;
+        float4 sums = {0.f, 0.f, 0.f, 0.f};
+        sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[0+64] * ((float)((q1[0]  >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[0+96] * ((float)((q2[0]  >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);
+        sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[1+64] * ((float)((q1[1]  >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[1+96] * ((float)((q2[1]  >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);
+        sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[2+64] * ((float)((q1[2]  >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[2+96] * ((float)((q2[2]  >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);
+        sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[3+64] * ((float)((q1[3]  >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[3+96] * ((float)((q2[3]  >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);
+        sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
+    }
+    float tot = sub_group_reduce_add(sumf);
+    if (get_sub_group_local_id() == 0) {
+        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
+    }
+}

package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl ADDED Viewed

@@ -0,0 +1,81 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+//------------------------------------------------------------------------------
+// norm
+//------------------------------------------------------------------------------
+kernel void kernel_norm(
+        global void * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        float eps,
+        local float * sum
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    dst = (global void*)((global char*)dst + offsetd);
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+    global float * x = (global float *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    // MEAN
+    // parallel sum
+    sum[get_local_id(0)] = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        sum[get_local_id(0)] += x[i00];
+    }
+    // reduce
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
+        if (get_local_id(0) < i) {
+            sum[get_local_id(0)] += sum[get_local_id(0) + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    float mean  = sum[0] / ne00;
+    // recenter and VARIANCE
+    barrier(CLK_LOCAL_MEM_FENCE);
+    global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    sum[get_local_id(0)] = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        y[i00] = x[i00] - mean;
+        sum[get_local_id(0)] += y[i00] * y[i00];
+    }
+    // reduce
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
+        if (get_local_id(0) < i) {
+            sum[get_local_id(0)] += sum[get_local_id(0) + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    float variance = sum[0] / ne00;
+    float scale = 1.0f/sqrt(variance + eps);
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        y[i00] = y[i00] * scale;
+    }
+}

package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/relu.cl ADDED Viewed

@@ -0,0 +1,16 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+//------------------------------------------------------------------------------
+// relu
+//------------------------------------------------------------------------------
+kernel void kernel_relu(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+    dst[get_global_id(0)] = fmax(0.0f, src0[get_global_id(0)]);
+}

package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl ADDED Viewed

@@ -0,0 +1,96 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+//------------------------------------------------------------------------------
+// rms_norm
+//------------------------------------------------------------------------------
+// This kernel depends on subgroup size.
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_32
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_rms_norm(
+        global void * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        float eps,
+        local float * sum // Note, the size depends on number of subgroups
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+    global float4 * x = (global float4 *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    global float * x_scalar = (global float *) x;
+    float4 sumf = 0;
+    float all_sum = 0;
+    // parallel sum
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        sumf += x[i00] * x[i00];
+    }
+    all_sum = sumf.s0 + sumf.s1 + sumf.s2 + sumf.s3;
+    all_sum = sub_group_reduce_add(all_sum);
+    if (get_sub_group_local_id() == 0) {
+        sum[get_sub_group_id()] = all_sum;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // broadcast
+    for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
+       if (get_local_id(0) < i) {
+           sum[get_local_id(0)] += sum[get_local_id(0) + i];
+       }
+    }
+    if (get_local_id(0) == 0) {
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+            sum[0] += x_scalar[i];
+        }
+        sum[0] /= ne00;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    const float mean  = sum[0];
+    const float scale = 1.0f/sqrt(mean + eps);
+    global float4 * y = (global float4 *) (dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    global float * y_scalar = (global float *) y;
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        y[i00] = x[i00] * scale;
+    }
+    if (get_local_id(0) == 0) {
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+            y_scalar[i00] = x_scalar[i00] * scale;
+        }
+    }
+}