whisper.rn 0.4.2 → 0.5.0-rc.0

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (98)
  1. package/README.md +1 -3
  2. package/android/build.gradle +70 -11
  3. package/android/src/main/CMakeLists.txt +28 -1
  4. package/android/src/main/java/com/rnwhisper/JSCallInvokerResolver.java +40 -0
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +80 -27
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +21 -9
  7. package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +1 -1
  8. package/android/src/main/jni.cpp +79 -2
  9. package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
  12. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
  15. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  16. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  17. package/cpp/ggml-backend.cpp +36 -18
  18. package/cpp/ggml-backend.h +1 -1
  19. package/cpp/ggml-cpu/amx/mmq.cpp +10 -9
  20. package/cpp/ggml-cpu/arch/arm/quants.c +109 -108
  21. package/cpp/ggml-cpu/arch/arm/repack.cpp +13 -12
  22. package/cpp/ggml-cpu/arch/x86/quants.c +83 -82
  23. package/cpp/ggml-cpu/arch/x86/repack.cpp +20 -19
  24. package/cpp/ggml-cpu/common.h +3 -2
  25. package/cpp/ggml-cpu/ggml-cpu-impl.h +9 -3
  26. package/cpp/ggml-cpu/ggml-cpu.c +95 -17
  27. package/cpp/ggml-cpu/ggml-cpu.cpp +4 -0
  28. package/cpp/ggml-cpu/ops.cpp +775 -74
  29. package/cpp/ggml-cpu/ops.h +7 -0
  30. package/cpp/ggml-cpu/quants.c +25 -24
  31. package/cpp/ggml-cpu/repack.cpp +15 -14
  32. package/cpp/ggml-cpu/simd-mappings.h +211 -33
  33. package/cpp/ggml-cpu/vec.cpp +26 -2
  34. package/cpp/ggml-cpu/vec.h +99 -45
  35. package/cpp/ggml-cpu.h +2 -0
  36. package/cpp/ggml-impl.h +125 -183
  37. package/cpp/ggml-metal-impl.h +27 -0
  38. package/cpp/ggml-metal.m +298 -41
  39. package/cpp/ggml-quants.c +6 -6
  40. package/cpp/ggml-whisper-sim.metallib +0 -0
  41. package/cpp/ggml-whisper.metallib +0 -0
  42. package/cpp/ggml.c +269 -40
  43. package/cpp/ggml.h +122 -2
  44. package/cpp/gguf.cpp +5 -1
  45. package/cpp/jsi/RNWhisperJSI.cpp +681 -0
  46. package/cpp/jsi/RNWhisperJSI.h +44 -0
  47. package/cpp/jsi/ThreadPool.h +100 -0
  48. package/cpp/whisper.cpp +4 -0
  49. package/cpp/whisper.h +2 -0
  50. package/ios/RNWhisper.h +3 -0
  51. package/ios/RNWhisper.mm +66 -31
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
  57. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
  58. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  59. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  60. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  61. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  62. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  68. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  69. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  70. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  71. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  72. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
  73. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
  74. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  77. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  78. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  79. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  80. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
  81. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
  82. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  83. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  84. package/jest/mock.js +1 -0
  85. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  86. package/lib/commonjs/index.js +83 -2
  87. package/lib/commonjs/index.js.map +1 -1
  88. package/lib/module/NativeRNWhisper.js.map +1 -1
  89. package/lib/module/index.js +83 -2
  90. package/lib/module/index.js.map +1 -1
  91. package/lib/typescript/NativeRNWhisper.d.ts +4 -0
  92. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  93. package/lib/typescript/index.d.ts +18 -6
  94. package/lib/typescript/index.d.ts.map +1 -1
  95. package/package.json +2 -3
  96. package/src/NativeRNWhisper.ts +2 -0
  97. package/src/index.ts +162 -33
  98. package/whisper-rn.podspec +6 -3
package/cpp/ggml-cpu/ops.cpp
@@ -3,6 +3,7 @@
  #include "ggml-cpu.h"
  #include "ggml-impl.h"
  #include "binary-ops.h"
+ #include "ggml.h"
  #include "unary-ops.h"
  #include "vec.h"

@@ -108,7 +109,7 @@ static void wsp_ggml_compute_forward_dup_f16(
  for (int i01 = ir0; i01 < ir1; i01++) {
  const wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
  for (int i00 = 0; i00 < ne00; i00++) {
- dst_ptr[id] = WSP_GGML_FP16_TO_FP32(src0_ptr[i00]);
+ dst_ptr[id] = WSP_GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
  id++;
  }
  }
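Note: the recurring change in this file, first visible in the hunk above and repeated through most hunks below, swaps the generic WSP_GGML_FP16_TO_FP32 / WSP_GGML_FP32_TO_FP16 macros for WSP_GGML_CPU_-prefixed variants. Assuming they mirror upstream ggml's GGML_CPU_FP16_TO_FP32, the CPU-local variant can lower to a single hardware convert instruction on architectures that have one, while the generic macro keeps the portable bit-twiddling path. A minimal sketch of that idea (illustrative only, not the package's actual definition):

    #include <stdint.h>
    #include <string.h>

    typedef uint16_t fp16_bits; // stand-in for wsp_ggml_fp16_t (IEEE half bit pattern)

    static inline float cpu_fp16_to_fp32(fp16_bits h) {
    #if defined(__ARM_NEON)
        // Hardware path: reinterpret the bits as __fp16; the compiler emits one fcvt.
        __fp16 f;
        memcpy(&f, &h, sizeof(h));
        return (float) f;
    #else
        // Portable path: expand sign/exponent/mantissa by hand.
        const uint32_t sign = (uint32_t)(h >> 15) << 31;
        uint32_t exp  = (h >> 10) & 0x1f;
        uint32_t mant = h & 0x3ff;
        uint32_t bits;
        if (exp == 0x1f) {                    // inf / NaN
            bits = sign | 0x7f800000u | (mant << 13);
        } else if (exp != 0) {                // normal number: rebias 15 -> 127
            bits = sign | ((exp + 112) << 23) | (mant << 13);
        } else if (mant != 0) {               // subnormal: renormalize
            exp = 113;
            while ((mant & 0x400) == 0) { mant <<= 1; exp--; }
            bits = sign | (exp << 23) | ((mant & 0x3ff) << 13);
        } else {
            bits = sign;                      // signed zero
        }
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    #endif
    }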
@@ -130,7 +131,7 @@ static void wsp_ggml_compute_forward_dup_f16(
  const wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);

  for (int i00 = 0; i00 < ne00; i00++) {
- src0_f32[i00] = WSP_GGML_FP16_TO_FP32(src0_ptr[i00]);
+ src0_f32[i00] = WSP_GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
  }

  wsp_quantize_row_q(src0_f32, dst_ptr + id, ne00);
@@ -156,7 +157,7 @@ static void wsp_ggml_compute_forward_dup_f16(
  for (int i00 = 0; i00 < ne00; i00++) {
  const wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);

- dst_ptr[id] = WSP_GGML_FP16_TO_FP32(*src0_ptr);
+ dst_ptr[id] = WSP_GGML_CPU_FP16_TO_FP32(*src0_ptr);
  id++;
  }
  }
@@ -267,7 +268,7 @@ static void wsp_ggml_compute_forward_dup_f16(
  const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
  char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

- *(float *) dst_ptr = WSP_GGML_FP16_TO_FP32(*(const wsp_ggml_fp16_t *) src0_ptr);
+ *(float *) dst_ptr = WSP_GGML_CPU_FP16_TO_FP32(*(const wsp_ggml_fp16_t *) src0_ptr);

  if (++i10 == ne0) {
  i10 = 0;
@@ -372,7 +373,7 @@ static void wsp_ggml_compute_forward_dup_bf16(
  for (int i01 = ir0; i01 < ir1; i01++) {
  const wsp_ggml_bf16_t * src0_ptr = (wsp_ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
  for (int i00 = 0; i00 < ne00; i00++) {
- dst_ptr[id] = WSP_GGML_FP32_TO_FP16(WSP_GGML_BF16_TO_FP32(src0_ptr[i00]));
+ dst_ptr[id] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_BF16_TO_FP32(src0_ptr[i00]));
  id++;
  }
  }
@@ -473,7 +474,7 @@ static void wsp_ggml_compute_forward_dup_bf16(
  for (int i00 = 0; i00 < ne00; i00++) {
  const wsp_ggml_bf16_t * src0_ptr = (wsp_ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);

- dst_ptr[id] = WSP_GGML_FP32_TO_FP16(WSP_GGML_BF16_TO_FP32(*src0_ptr));
+ dst_ptr[id] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_BF16_TO_FP32(*src0_ptr));
  id++;
  }
  }
@@ -566,7 +567,7 @@ static void wsp_ggml_compute_forward_dup_bf16(
  const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
  char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

- *(wsp_ggml_fp16_t *) dst_ptr = WSP_GGML_FP32_TO_FP16(WSP_GGML_BF16_TO_FP32(*(const wsp_ggml_bf16_t *) src0_ptr));
+ *(wsp_ggml_fp16_t *) dst_ptr = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_BF16_TO_FP32(*(const wsp_ggml_bf16_t *) src0_ptr));

  if (++i10 == ne0) {
  i10 = 0;
@@ -696,24 +697,8 @@ static void wsp_ggml_compute_forward_dup_f32(
  if (wsp_ggml_is_contiguous(dst)) {
  // TODO: simplify
  if (nb00 == sizeof(float)) {
- if (dst->type == WSP_GGML_TYPE_F32) {
- size_t id = 0;
- const size_t rs = ne00 * nb00;
- char * dst_ptr = (char *) dst->data;
-
- for (int i03 = 0; i03 < ne03; i03++) {
- for (int i02 = 0; i02 < ne02; i02++) {
- id += rs * ir0;
- for (int i01 = ir0; i01 < ir1; i01++) {
- const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
- memcpy(dst_ptr + id, src0_ptr, rs);
- id += rs;
- }
- id += rs * (ne01 - ir1);
- }
- }
- } else if (wsp_ggml_get_type_traits_cpu(dst->type)->from_float) {
- wsp_ggml_from_float_t const wsp_quantize_row_q = wsp_ggml_get_type_traits_cpu(dst->type)->from_float;
+ if (wsp_ggml_get_type_traits_cpu(dst->type)->from_float) {
+ wsp_ggml_from_float_t const from_float = wsp_ggml_get_type_traits_cpu(dst->type)->from_float;

  size_t id = 0;
  size_t rs = nb0 * (ne00 / wsp_ggml_blck_size(dst->type));
@@ -724,7 +709,7 @@ static void wsp_ggml_compute_forward_dup_f32(
  id += rs * ir0;
  for (int i01 = ir0; i01 < ir1; i01++) {
  const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
- wsp_quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+ from_float(src0_ptr, dst_ptr + id, ne00);
  id += rs;
  }
  id += rs * (ne01 - ir1);
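Note: the two hunks above drop the dedicated F32-to-F32 memcpy branch and rename the misleading local wsp_quantize_row_q to from_float, so every contiguous destination type now goes through the type traits' from_float callback. That is only an equivalent simplification if the F32 entry of the CPU type traits exists and is itself a plain row copy, as it is in upstream ggml; conceptually:

    #include <stdint.h>
    #include <string.h>

    // Hypothetical F32 "from_float" entry: converting float to float is a copy,
    // which is what makes the removed memcpy fast path redundant.
    static void from_float_f32(const float * x, void * y, int64_t n) {
        memcpy(y, x, n * sizeof(float));
    }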
@@ -765,7 +750,7 @@ static void wsp_ggml_compute_forward_dup_f32(
  for (int i00 = 0; i00 < ne00; i00++) {
  const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);

- dst_ptr[id] = WSP_GGML_FP32_TO_FP16(*src0_ptr);
+ dst_ptr[id] = WSP_GGML_CPU_FP32_TO_FP16(*src0_ptr);
  id++;
  }
  }
@@ -878,7 +863,7 @@ static void wsp_ggml_compute_forward_dup_f32(
  const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
  char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

- *(wsp_ggml_fp16_t *) dst_ptr = WSP_GGML_FP32_TO_FP16(*(const float *) src0_ptr);
+ *(wsp_ggml_fp16_t *) dst_ptr = WSP_GGML_CPU_FP32_TO_FP16(*(const float *) src0_ptr);

  if (++i10 == ne0) {
  i10 = 0;
@@ -1419,7 +1404,7 @@ static void wsp_ggml_compute_forward_add1_f16_f32(
  wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
  wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
  for (int i = 0; i < ne0; i++) {
- dst_ptr[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(src0_ptr[i]) + v);
+ dst_ptr[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v);
  }
  }
  }
@@ -1435,7 +1420,7 @@ static void wsp_ggml_compute_forward_add1_f16_f16(
  WSP_GGML_ASSERT(wsp_ggml_is_scalar(src1));

  // scalar to add
- const float v = WSP_GGML_FP16_TO_FP32(*(wsp_ggml_fp16_t *) src1->data);
+ const float v = WSP_GGML_CPU_FP16_TO_FP32(*(wsp_ggml_fp16_t *) src1->data);

  const int ith = params->ith;
  const int nth = params->nth;
@@ -1467,7 +1452,7 @@ static void wsp_ggml_compute_forward_add1_f16_f16(
  wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
  wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
  for (int i = 0; i < ne0; i++) {
- dst_ptr[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(src0_ptr[i]) + v);
+ dst_ptr[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v);
  }
  }
  }
@@ -1889,7 +1874,7 @@ static void wsp_ggml_compute_forward_sum_f16(
  }
  }
  }
- ((wsp_ggml_fp16_t *) dst->data)[0] = WSP_GGML_FP32_TO_FP16(sum);
+ ((wsp_ggml_fp16_t *) dst->data)[0] = WSP_GGML_CPU_FP32_TO_FP16(sum);
  }

  static void wsp_ggml_compute_forward_sum_bf16(
@@ -2300,6 +2285,12 @@ void wsp_ggml_compute_forward_repeat(
  {
  wsp_ggml_compute_forward_repeat_f32(params, dst);
  } break;
+ // TODO: templateify the implemenation and support for I64
+ // ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
+ //case WSP_GGML_TYPE_I64:
+ // {
+ // wsp_ggml_compute_forward_repeat_i64(params, dst);
+ // } break;
  default:
  {
  WSP_GGML_ABORT("fatal error");
@@ -2660,7 +2651,7 @@ static void wsp_ggml_compute_forward_gelu_f16(
  #ifndef NDEBUG
  for (int k = 0; k < nc; k++) {
  const wsp_ggml_fp16_t x = ((wsp_ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
- const float v = WSP_GGML_FP16_TO_FP32(x);
+ const float v = WSP_GGML_CPU_FP16_TO_FP32(x);
  WSP_GGML_UNUSED(v);
  assert(!isnan(v));
  assert(!isinf(v));
@@ -2763,7 +2754,7 @@ static void wsp_ggml_compute_forward_gelu_erf_f16(
  #ifndef NDEBUG
  for (int k = 0; k < nc; k++) {
  const wsp_ggml_fp16_t x = ((wsp_ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
- const float v = WSP_GGML_FP16_TO_FP32(x);
+ const float v = WSP_GGML_CPU_FP16_TO_FP32(x);
  WSP_GGML_UNUSED(v);
  assert(!isnan(v));
  assert(!isinf(v));
@@ -2866,7 +2857,7 @@ static void wsp_ggml_compute_forward_gelu_quick_f16(
  #ifndef NDEBUG
  for (int k = 0; k < nc; k++) {
  const wsp_ggml_fp16_t x = ((wsp_ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
- const float v = WSP_GGML_FP16_TO_FP32(x);
+ const float v = WSP_GGML_CPU_FP16_TO_FP32(x);
  WSP_GGML_UNUSED(v);
  assert(!isnan(v));
  assert(!isinf(v));
@@ -2969,7 +2960,7 @@ static void wsp_ggml_compute_forward_silu_f16(
  #ifndef NDEBUG
  for (int k = 0; k < nc; k++) {
  const wsp_ggml_fp16_t x = ((wsp_ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k];
- const float v = WSP_GGML_FP16_TO_FP32(x);
+ const float v = WSP_GGML_CPU_FP16_TO_FP32(x);
  WSP_GGML_UNUSED(v);
  assert(!isnan(v));
  assert(!isinf(v));
@@ -3163,7 +3154,7 @@ static void wsp_ggml_compute_forward_silu_back_f16(
  #ifndef NDEBUG
  for (int k = 0; k < nc; k++) {
  const float x = ((wsp_ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
- const float v = WSP_GGML_FP16_TO_FP32(x);
+ const float v = WSP_GGML_CPU_FP16_TO_FP32(x);
  WSP_GGML_UNUSED(v);
  assert(!isnan(v));
  assert(!isinf(v));
@@ -3194,6 +3185,435 @@ void wsp_ggml_compute_forward_silu_back(
  }
  }

+ // wsp_ggml_compute_forward_reglu
+
+ static void wsp_ggml_compute_forward_reglu_f32(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+ const wsp_ggml_tensor * src1 = dst->src[1];
+ char * src0_d = (char *) src0->data;
+ char * src1_d = (char *) (src1 ? src1->data : src0->data);
+ const size_t src0_o = src0->nb[1];
+ const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src0));
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(dst));
+
+ if (src1) {
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src1));
+ WSP_GGML_ASSERT(src0->type == src1->type);
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+ const int nr = wsp_ggml_nrows(src0);
+
+ WSP_GGML_ASSERT(dst->ne[0] == nc);
+ WSP_GGML_ASSERT(wsp_ggml_nrows(dst) == nr);
+
+ const int32_t swapped = wsp_ggml_get_op_params_i32(dst, 1);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ float * src0_p = (float *) (src0_d + i1*src0_o);
+ float * src1_p = (float *) (src1_d + i1*src1_o);
+
+ if (!src1) {
+ src0_p += swapped ? nc : 0;
+ src1_p += swapped ? 0 : nc;
+ }
+
+ wsp_ggml_vec_reglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+ #ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ WSP_GGML_UNUSED(x);
+ assert(!isnan(x));
+ assert(!isinf(x));
+ }
+ #endif
+ }
+ }
+
+ static void wsp_ggml_compute_forward_reglu_f16(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+ const wsp_ggml_tensor * src1 = dst->src[1];
+ char * src0_d = (char *) src0->data;
+ char * src1_d = (char *) (src1 ? src1->data : src0->data);
+ const size_t src0_o = src0->nb[1];
+ const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src0));
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(dst));
+
+ if (src1) {
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src1));
+ WSP_GGML_ASSERT(src0->type == src1->type);
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+ const int nr = wsp_ggml_nrows(src0);
+
+ WSP_GGML_ASSERT(dst->ne[0] == nc);
+ WSP_GGML_ASSERT(wsp_ggml_nrows(dst) == nr);
+
+ const int32_t swapped = wsp_ggml_get_op_params_i32(dst, 1);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ wsp_ggml_fp16_t * src0_p = (wsp_ggml_fp16_t *) (src0_d + i1*src0_o);
+ wsp_ggml_fp16_t * src1_p = (wsp_ggml_fp16_t *) (src1_d + i1*src1_o);
+
+ if (!src1) {
+ src0_p += swapped ? nc : 0;
+ src1_p += swapped ? 0 : nc;
+ }
+
+ wsp_ggml_vec_reglu_f16(nc, (wsp_ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+ #ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const wsp_ggml_fp16_t x = ((wsp_ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ const float v = WSP_GGML_FP16_TO_FP32(x);
+ WSP_GGML_UNUSED(v);
+ assert(!isnan(v));
+ assert(!isinf(v));
+ }
+ #endif
+ }
+ }
+
+ static void wsp_ggml_compute_forward_reglu(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case WSP_GGML_TYPE_F32:
+ {
+ wsp_ggml_compute_forward_reglu_f32(params, dst);
+ } break;
+ case WSP_GGML_TYPE_F16:
+ {
+ wsp_ggml_compute_forward_reglu_f16(params, dst);
+ } break;
+ default:
+ {
+ WSP_GGML_ABORT("fatal error");
+ }
+ }
+ }
+
+ // wsp_ggml_compute_forward_geglu
+
+ static void wsp_ggml_compute_forward_geglu_f32(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+ const wsp_ggml_tensor * src1 = dst->src[1];
+ char * src0_d = (char *) src0->data;
+ char * src1_d = (char *) (src1 ? src1->data : src0->data);
+ const size_t src0_o = src0->nb[1];
+ const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src0));
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(dst));
+
+ if (src1) {
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src1));
+ WSP_GGML_ASSERT(src0->type == src1->type);
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+ const int nr = wsp_ggml_nrows(src0);
+
+ WSP_GGML_ASSERT(dst->ne[0] == nc);
+ WSP_GGML_ASSERT(wsp_ggml_nrows(dst) == nr);
+
+ const int32_t swapped = wsp_ggml_get_op_params_i32(dst, 1);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ float * src0_p = (float *) (src0_d + i1*src0_o);
+ float * src1_p = (float *) (src1_d + i1*src1_o);
+
+ if (!src1) {
+ src0_p += swapped ? nc : 0;
+ src1_p += swapped ? 0 : nc;
+ }
+
+ wsp_ggml_vec_geglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+ #ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ WSP_GGML_UNUSED(x);
+ assert(!isnan(x));
+ assert(!isinf(x));
+ }
+ #endif
+ }
+ }
+
+ static void wsp_ggml_compute_forward_geglu_f16(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+ const wsp_ggml_tensor * src1 = dst->src[1];
+ char * src0_d = (char *) src0->data;
+ char * src1_d = (char *) (src1 ? src1->data : src0->data);
+ const size_t src0_o = src0->nb[1];
+ const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src0));
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(dst));
+
+ if (src1) {
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src1));
+ WSP_GGML_ASSERT(src0->type == src1->type);
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+ const int nr = wsp_ggml_nrows(src0);
+
+ WSP_GGML_ASSERT(dst->ne[0] == nc);
+ WSP_GGML_ASSERT(wsp_ggml_nrows(dst) == nr);
+
+ const int32_t swapped = wsp_ggml_get_op_params_i32(dst, 1);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ wsp_ggml_fp16_t * src0_p = (wsp_ggml_fp16_t *) (src0_d + i1*src0_o);
+ wsp_ggml_fp16_t * src1_p = (wsp_ggml_fp16_t *) (src1_d + i1*src1_o);
+
+ if (!src1) {
+ src0_p += swapped ? nc : 0;
+ src1_p += swapped ? 0 : nc;
+ }
+
+ wsp_ggml_vec_geglu_f16(nc, (wsp_ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+ #ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const wsp_ggml_fp16_t x = ((wsp_ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ const float v = WSP_GGML_FP16_TO_FP32(x);
+ WSP_GGML_UNUSED(v);
+ assert(!isnan(v));
+ assert(!isinf(v));
+ }
+ #endif
+ }
+ }
+
+ static void wsp_ggml_compute_forward_geglu(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case WSP_GGML_TYPE_F32:
+ {
+ wsp_ggml_compute_forward_geglu_f32(params, dst);
+ } break;
+ case WSP_GGML_TYPE_F16:
+ {
+ wsp_ggml_compute_forward_geglu_f16(params, dst);
+ } break;
+ default:
+ {
+ WSP_GGML_ABORT("fatal error");
+ }
+ }
+ }
+
+ // wsp_ggml_compute_forward_swiglu
+
+ static void wsp_ggml_compute_forward_swiglu_f32(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+ const wsp_ggml_tensor * src1 = dst->src[1];
+ char * src0_d = (char *) src0->data;
+ char * src1_d = (char *) (src1 ? src1->data : src0->data);
+ const size_t src0_o = src0->nb[1];
+ const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src0));
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(dst));
+
+ if (src1) {
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src1));
+ WSP_GGML_ASSERT(src0->type == src1->type);
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+ const int nr = wsp_ggml_nrows(src0);
+
+ WSP_GGML_ASSERT(dst->ne[0] == nc);
+ WSP_GGML_ASSERT(wsp_ggml_nrows(dst) == nr);
+
+ const int32_t swapped = wsp_ggml_get_op_params_i32(dst, 1);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ float * src0_p = (float *) (src0_d + i1*src0_o);
+ float * src1_p = (float *) (src1_d + i1*src1_o);
+
+ if (!src1) {
+ src0_p += swapped ? nc : 0;
+ src1_p += swapped ? 0 : nc;
+ }
+
+ wsp_ggml_vec_swiglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+ #ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ WSP_GGML_UNUSED(x);
+ assert(!isnan(x));
+ assert(!isinf(x));
+ }
+ #endif
+ }
+ }
+
+ static void wsp_ggml_compute_forward_swiglu_f16(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+ const wsp_ggml_tensor * src1 = dst->src[1];
+ char * src0_d = (char *) src0->data;
+ char * src1_d = (char *) (src1 ? src1->data : src0->data);
+ const size_t src0_o = src0->nb[1];
+ const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src0));
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(dst));
+
+ if (src1) {
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src1));
+ WSP_GGML_ASSERT(src0->type == src1->type);
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+ const int nr = wsp_ggml_nrows(src0);
+
+ WSP_GGML_ASSERT(dst->ne[0] == nc);
+ WSP_GGML_ASSERT(wsp_ggml_nrows(dst) == nr);
+
+ const int32_t swapped = wsp_ggml_get_op_params_i32(dst, 1);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ wsp_ggml_fp16_t * src0_p = (wsp_ggml_fp16_t *) (src0_d + i1*src0_o);
+ wsp_ggml_fp16_t * src1_p = (wsp_ggml_fp16_t *) (src1_d + i1*src1_o);
+
+ if (!src1) {
+ src0_p += swapped ? nc : 0;
+ src1_p += swapped ? 0 : nc;
+ }
+
+ wsp_ggml_vec_swiglu_f16(nc, (wsp_ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
+
+ #ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const wsp_ggml_fp16_t x = ((wsp_ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ const float v = WSP_GGML_FP16_TO_FP32(x);
+ WSP_GGML_UNUSED(v);
+ assert(!isnan(v));
+ assert(!isinf(v));
+ }
+ #endif
+ }
+ }
+
+ static void wsp_ggml_compute_forward_swiglu(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case WSP_GGML_TYPE_F32:
+ {
+ wsp_ggml_compute_forward_swiglu_f32(params, dst);
+ } break;
+ case WSP_GGML_TYPE_F16:
+ {
+ wsp_ggml_compute_forward_swiglu_f16(params, dst);
+ } break;
+ default:
+ {
+ WSP_GGML_ABORT("fatal error");
+ }
+ }
+ }
+
  // wsp_ggml_compute_forward_norm

  static void wsp_ggml_compute_forward_norm_f32(
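Note: the roughly 430 lines added above are the fused GLU family. The three variants share all of their layout logic (the activation input and the gate come either from two separate tensors, src0 and src1, or from the two halves of src0, with the swapped op param selecting which half gates) and differ only in the activation applied before the element-wise product. A sketch of the per-row math the wsp_ggml_vec_*_f32 helpers presumably implement (the package may use the erf-based GELU rather than the tanh approximation shown here):

    #include <math.h>

    static inline float relu_f32(float x) { return x > 0.0f ? x : 0.0f; }
    static inline float silu_f32(float x) { return x / (1.0f + expf(-x)); }
    static inline float gelu_f32(float x) { // tanh approximation of GELU
        return 0.5f*x*(1.0f + tanhf(0.7978845608f*(x + 0.044715f*x*x*x)));
    }

    // One output row of each fused op: y[i] = activation(x[i]) * g[i]
    static void vec_reglu_f32(int n, float * y, const float * x, const float * g) {
        for (int i = 0; i < n; ++i) y[i] = relu_f32(x[i]) * g[i];
    }
    static void vec_geglu_f32(int n, float * y, const float * x, const float * g) {
        for (int i = 0; i < n; ++i) y[i] = gelu_f32(x[i]) * g[i];
    }
    static void vec_swiglu_f32(int n, float * y, const float * x, const float * g) {
        for (int i = 0; i < n; ++i) y[i] = silu_f32(x[i]) * g[i];
    }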
@@ -4470,6 +4890,74 @@ void wsp_ggml_compute_forward_get_rows(
  //}
  }

+ static void wsp_ggml_compute_forward_set_rows_f32(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+ const wsp_ggml_tensor * src1 = dst->src[1];
+
+ WSP_GGML_TENSOR_BINARY_OP_LOCALS
+
+ const int64_t nc = ne00;
+ const int64_t nr = ne01;
+
+ assert(ne0 == nc);
+ assert(ne2 == ne02);
+ assert(ne3 == ne03);
+ assert(src0->type == WSP_GGML_TYPE_F32);
+ assert(ne02 % ne11 == 0);
+ assert(ne03 % ne12 == 0);
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ // rows per thread
+ const int64_t dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int64_t ir0 = dr*ith;
+ const int64_t ir1 = std::min(ir0 + dr, nr);
+
+ wsp_ggml_from_float_t const from_float = wsp_ggml_get_type_traits_cpu(dst->type)->from_float;
+
+ for (int64_t i03 = 0; i03 < ne03; ++i03) {
+ for (int64_t i02 = 0; i02 < ne02; ++i02) {
+ for (int64_t i = ir0; i < ir1; ++i) {
+ const int64_t i12 = i03%ne12;
+ const int64_t i11 = i02%ne11;
+ const int64_t i10 = i;
+
+ const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ WSP_GGML_ASSERT(i1 >= 0 && i1 < ne1);
+
+ from_float(
+ (const float *) ((char *) src0->data + i*nb01 + i02*nb02 + i03*nb03),
+ ((char *) dst->data + i1*nb1 + i02*nb2 + i03*nb3), nc);
+ }
+ }
+ }
+ }
+
+ void wsp_ggml_compute_forward_set_rows(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case WSP_GGML_TYPE_F32:
+ {
+ wsp_ggml_compute_forward_set_rows_f32(params, dst);
+ } break;
+ default:
+ {
+ WSP_GGML_ABORT("src0->type = %d (%s) not supported", src0->type, wsp_ggml_type_name(src0->type));
+ }
+ }
+ }
+
  // wsp_ggml_compute_forward_get_rows_back

  static void wsp_ggml_compute_forward_get_rows_back_f32_f16(
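Note: SET_ROWS (added above) is the scatter counterpart of GET_ROWS: row i of src0 is written into the dst row named by the int64 index tensor src1, converting through from_float so the destination may be F16 or quantized. Stripped of the dim-2/3 broadcasting and threading, the semantics reduce to this sketch:

    #include <stdint.h>
    #include <string.h>

    // Minimal 2-D case: dst[idx[i], :] = src[i, :]
    static void set_rows_f32(int64_t nr, int64_t nc,
                             const float * src, const int64_t * idx, float * dst) {
        for (int64_t i = 0; i < nr; ++i) {
            memcpy(dst + idx[i] * nc, src + i * nc, nc * sizeof(float));
        }
    }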
@@ -4500,7 +4988,7 @@ static void wsp_ggml_compute_forward_get_rows_back_f32_f16(

  for (int j = 0; j < nc; ++j) {
  wsp_ggml_fp16_t v = ((wsp_ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
- ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += WSP_GGML_FP16_TO_FP32(v);
+ ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += WSP_GGML_CPU_FP16_TO_FP32(v);
  }
  }
  }
@@ -4792,7 +5280,7 @@ static void wsp_ggml_compute_forward_soft_max_f32(
  if (mp_f32) {
  if (use_f16) {
  for (int i = 0; i < nc; ++i) {
- wp[i] += slope*WSP_GGML_FP16_TO_FP32(mp_f16[i]);
+ wp[i] += slope*WSP_GGML_CPU_FP16_TO_FP32(mp_f16[i]);
  }
  } else {
  for (int i = 0; i < nc; ++i) {
@@ -5018,8 +5506,8 @@ static void wsp_ggml_compute_forward_clamp_f16(
  wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + j*nb01);

  for (int i = 0; i < nc; i++) {
- float v = WSP_GGML_FP16_TO_FP32(src0_ptr[i]);
- dst_ptr[i] = WSP_GGML_FP32_TO_FP16(MAX(MIN(v, max), min));
+ float v = WSP_GGML_CPU_FP16_TO_FP32(src0_ptr[i]);
+ dst_ptr[i] = WSP_GGML_CPU_FP32_TO_FP16(MAX(MIN(v, max), min));
  }
  }
  }
@@ -5476,11 +5964,11 @@ static void wsp_ggml_compute_forward_rope_f16(
  const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
  wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

- const float x0 = WSP_GGML_FP16_TO_FP32(src[0]);
- const float x1 = WSP_GGML_FP16_TO_FP32(src[n_dims]);
+ const float x0 = WSP_GGML_CPU_FP16_TO_FP32(src[0]);
+ const float x1 = WSP_GGML_CPU_FP16_TO_FP32(src[n_dims]);

- dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
- dst_data[n_dims] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ dst_data[0] = WSP_GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[n_dims] = WSP_GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
  }
  } else {
  for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
@@ -5492,11 +5980,11 @@ static void wsp_ggml_compute_forward_rope_f16(
  const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
  wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

- const float x0 = WSP_GGML_FP16_TO_FP32(src[0]);
- const float x1 = WSP_GGML_FP16_TO_FP32(src[n_dims/2]);
+ const float x0 = WSP_GGML_CPU_FP16_TO_FP32(src[0]);
+ const float x1 = WSP_GGML_CPU_FP16_TO_FP32(src[n_dims/2]);

- dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
- dst_data[n_dims/2] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ dst_data[0] = WSP_GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[n_dims/2] = WSP_GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
  }
  }
  } else {
@@ -5507,11 +5995,11 @@ static void wsp_ggml_compute_forward_rope_f16(
  const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
  wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

- const float x0 = WSP_GGML_FP16_TO_FP32(src[0]);
- const float x1 = WSP_GGML_FP16_TO_FP32(src[1]);
+ const float x0 = WSP_GGML_CPU_FP16_TO_FP32(src[0]);
+ const float x1 = WSP_GGML_CPU_FP16_TO_FP32(src[1]);

- dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
- dst_data[1] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ dst_data[0] = WSP_GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[1] = WSP_GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
  }
  }

@@ -5525,11 +6013,11 @@ static void wsp_ggml_compute_forward_rope_f16(
  const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
  wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

- const float x0 = WSP_GGML_FP16_TO_FP32(src[0]);
- const float x1 = WSP_GGML_FP16_TO_FP32(src[n_dims]);
+ const float x0 = WSP_GGML_CPU_FP16_TO_FP32(src[0]);
+ const float x1 = WSP_GGML_CPU_FP16_TO_FP32(src[n_dims]);

- dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
- dst_data[n_dims] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ dst_data[0] = WSP_GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[n_dims] = WSP_GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
  }
  } else {
  for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
@@ -5640,7 +6128,7 @@ static void wsp_ggml_compute_forward_conv_transpose_1d_f16_f32(
  for (int64_t i11 = 0; i11 < ne11; i11++) {
  const float * const src = (float *)((char *) src1->data + i11*nb11);
  for (int64_t i10 = 0; i10 < ne10; i10++) {
- dst_data[i10*ne11 + i11] = WSP_GGML_FP32_TO_FP16(src[i10]);
+ dst_data[i10*ne11 + i11] = WSP_GGML_CPU_FP32_TO_FP16(src[i10]);
  }
  }
  }
@@ -5933,7 +6421,7 @@ static void wsp_ggml_compute_forward_im2col_f16(
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
  dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
  } else {
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = WSP_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = WSP_GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]);
  }
  }
  }
@@ -6058,6 +6546,186 @@ void wsp_ggml_compute_forward_im2col_back_f32(
  }
  }

+ static void wsp_ggml_call_mul_mat(wsp_ggml_type type, const wsp_ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
+ void * a, void * b, float * c) {
+ const wsp_ggml_type_traits * traits = wsp_ggml_get_type_traits(type);
+ struct wsp_ggml_tensor src1 = {};
+ src1.type = type;
+ src1.ne[0] = k;
+ src1.ne[1] = m;
+ src1.ne[2] = 1;
+ src1.ne[3] = 1;
+ src1.nb[0] = traits->type_size;
+ src1.nb[1] = k * traits->type_size;
+ src1.nb[2] = src1.nb[1];
+ src1.nb[3] = src1.nb[2];
+ src1.data = a;
+
+ struct wsp_ggml_tensor src0 = {};
+ src0.type = type;
+ src0.ne[0] = k;
+ src0.ne[1] = n;
+ src0.ne[2] = 1;
+ src0.ne[3] = 1;
+ src0.nb[0] = traits->type_size;
+ src0.nb[1] = k * traits->type_size;
+ src0.nb[2] = src0.nb[1];
+ src0.nb[3] = src0.nb[2];
+ src0.data = b;
+
+ struct wsp_ggml_tensor dst = {};
+ dst.ne[0] = n;
+ dst.ne[1] = m;
+ dst.ne[2] = 1;
+ dst.ne[3] = 1;
+ dst.nb[0] = sizeof(float);
+ dst.nb[1] = n * sizeof(float);
+ dst.nb[2] = dst.nb[1];
+ dst.nb[3] = dst.nb[2];
+ dst.data = c;
+ dst.src[0] = &src0;
+ dst.src[1] = &src1;
+
+ wsp_ggml_compute_forward_mul_mat(params, &dst);
+ }
+
+ // wsp_ggml_compute_forward_conv_2d
+
+ static void wsp_ggml_compute_forward_conv_2d_impl(const wsp_ggml_compute_params * params,
+ const wsp_ggml_tensor * kernel, // [KW, KH, IC, OC]
+ const wsp_ggml_tensor * src, // [W, H, C, N]
+ wsp_ggml_tensor * dst, // [OW, OH, OC, N]
+ wsp_ggml_type kernel_type) {
+
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous(kernel));
+ WSP_GGML_ASSERT(kernel_type == WSP_GGML_TYPE_F16 || kernel_type == WSP_GGML_TYPE_F32);
+ WSP_GGML_ASSERT(kernel->type == kernel_type);
+
+ const wsp_ggml_type_traits * traits = wsp_ggml_get_type_traits(kernel_type);
+
+ const int32_t stride_x = dst->op_params[0];
+ const int32_t stride_y = dst->op_params[1];
+ const int32_t pad_x = dst->op_params[2];
+ const int32_t pad_y = dst->op_params[3];
+ const int32_t dilation_x = dst->op_params[4];
+ const int32_t dilation_y = dst->op_params[5];
+
+ const int64_t c_in = src->ne[2];
+ const int64_t c_out = kernel->ne[3];
+ WSP_GGML_ASSERT(c_in == kernel->ne[2]);
+
+ const int64_t src_w = src->ne[0];
+ const int64_t src_h = src->ne[1];
+ const int64_t knl_w = kernel->ne[0];
+ const int64_t knl_h = kernel->ne[1];
+ const int64_t dst_w = dst->ne[0];
+ const int64_t dst_h = dst->ne[1];
+
+ const float * src_data = (float *) src->data;
+ void * knl_data = kernel->data;
+ float * dst_data = (float *) dst->data;
+
+ const int64_t knl_n = knl_w * knl_h * c_in;
+ const int64_t patch_total = dst->ne[3] * dst_w * dst_h;
+
+ const int64_t space_per_patch = knl_n * traits->type_size + c_out * sizeof(float);
+ const int64_t batch_size = params->wsize / space_per_patch;
+ const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
+ const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch;
+
+ WSP_GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
+
+ void * tmp = params->wdata;
+
+ for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
+
+ const int64_t patch_start_batch = batch_i * patches_per_batch;
+ const int64_t patch_end_batch = std::min(patch_start_batch + patches_per_batch,
+ patch_total);
+ const int64_t patch_n = patch_end_batch - patch_start_batch;
+
+ const int64_t patch_per_thread = (patch_n + params->nth - 1) / params->nth;
+ const int64_t patch_start = patch_start_batch + params->ith * patch_per_thread;
+ const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch);
+
+ //im2col for a patch
+ for (int64_t p = patch_start; p < patch_end; ++p) {
+ const int64_t batch_n = p / (dst_w * dst_h);
+ const int64_t src_x = (p / dst_w) % dst_h;
+ const int64_t src_y = p % dst_w;
+
+ const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]);
+ char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size;
+
+ for (int64_t ic = 0; ic < c_in; ++ic) {
+ for (int64_t ky = 0; ky < knl_h; ++ky) {
+ for (int64_t kx = 0; kx < knl_w; ++kx) {
+ const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y;
+ const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x;
+
+ int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
+
+ float src_val;
+ if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
+ src_val = 0.0f;
+ } else {
+ const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
+ src_val = *src_ptr;
+ }
+
+ char * element_ptr = dst_row + dst_idx * traits->type_size;
+ if (kernel_type == WSP_GGML_TYPE_F32) {
+ *(float *) element_ptr = src_val;
+ } else if (kernel_type == WSP_GGML_TYPE_F16) {
+ *(wsp_ggml_fp16_t *) element_ptr = WSP_GGML_CPU_FP32_TO_FP16(src_val);
+ }
+ }
+ }
+ }
+ } // patches handled by this thread
+
+ wsp_ggml_barrier(params->threadpool);
+
+ float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size);
+
+ WSP_GGML_ASSERT(gemm_output + patch_n * c_out <= (float*)tmp + params->wsize);
+
+ // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
+ wsp_ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output);
+
+ wsp_ggml_barrier(params->threadpool);
+
+
+ //permute back [OC, N, OH, OW] to [N, OC, OH, OW]
+ const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth;
+ const int64_t permute_start = params->ith * permute_per_thread;
+ const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n);
+
+ for (int64_t i = permute_start; i < permute_end; ++i) {
+ const int64_t p = patch_start_batch + i;
+ const int64_t batch_n = p / (dst_w * dst_h);
+ const int64_t dst_y = (p / dst_w) % dst_h;
+ const int64_t dst_x = p % dst_w;
+
+ for (int64_t oc = 0; oc < c_out; ++oc) {
+ const float value = gemm_output[i * c_out + oc];
+ float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]);
+ *dst_ptr = value;
+ }
+ }
+ }
+ }
+
+ void wsp_ggml_compute_forward_conv_2d(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_tensor * src0 = dst->src[0];
+ const wsp_ggml_tensor * src1 = dst->src[1];
+
+ wsp_ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type);
+ }
+
  // wsp_ggml_compute_forward_conv_transpose_2d

  void wsp_ggml_compute_forward_conv_transpose_2d(
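Note: the new direct CONV_2D above lowers convolution to im2col plus GEMM, processing as many output patches per pass as fit into the params->wsize scratch buffer and then permuting the GEMM result back into the destination layout. The im2col bounds checks assume the usual output-size relation between input size, kernel size, stride, padding, and dilation; as a hypothetical helper (conv_out_size is not a name from the diff):

    #include <stdint.h>

    // Standard convolution output size per axis.
    static inline int64_t conv_out_size(int64_t in, int64_t k, int64_t stride,
                                        int64_t pad, int64_t dil) {
        return (in + 2*pad - dil*(k - 1) - 1) / stride + 1;
    }
    // e.g. in = 224, k = 3, stride = 1, pad = 1, dil = 1  ->  224

The per-batch GEMM then computes output[patch_n, c_out] from patches[patch_n, knl_n] and the kernel viewed as [knl_n, c_out], with knl_n = KW*KH*IC, exactly as the comment in the added code states.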
@@ -6109,7 +6777,7 @@ void wsp_ggml_compute_forward_conv_transpose_2d(
  const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
  wsp_ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
  for (int i10 = 0; i10 < ne10; i10++) {
- dst_data[i10*ne12 + i12] = WSP_GGML_FP32_TO_FP16(src[i10]);
+ dst_data[i10*ne12 + i12] = WSP_GGML_CPU_FP32_TO_FP16(src[i10]);
  }
  }
  }
@@ -6358,7 +7026,7 @@ static void wsp_ggml_compute_forward_pool_1d_sk_p0(
  case WSP_GGML_OP_POOL_COUNT: WSP_GGML_ABORT("fatal error");
  }
  for (int ki = 0; ki < k; ++ki) {
- const float srow_j = (src->type == WSP_GGML_TYPE_F32) ? ((const float*)srow)[j] : WSP_GGML_FP16_TO_FP32(((const wsp_ggml_fp16_t*)srow)[j]);
+ const float srow_j = (src->type == WSP_GGML_TYPE_F32) ? ((const float*)srow)[j] : WSP_GGML_CPU_FP16_TO_FP32(((const wsp_ggml_fp16_t*)srow)[j]);
  switch (op) {
  case WSP_GGML_OP_POOL_AVG: drow[i] += srow_j; break;
  case WSP_GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break;
@@ -6450,7 +7118,7 @@ void wsp_ggml_compute_forward_pool_2d(
  for (int kx = 0; kx < k0; ++kx) {
  int j = ix + kx;
  if (j < 0 || j >= src->ne[0]) continue;
- const float srow_j = (src->type == WSP_GGML_TYPE_F32) ? ((const float*)srow)[j] : WSP_GGML_FP16_TO_FP32(((const wsp_ggml_fp16_t*)srow)[j]);
+ const float srow_j = (src->type == WSP_GGML_TYPE_F32) ? ((const float*)srow)[j] : WSP_GGML_CPU_FP16_TO_FP32(((const wsp_ggml_fp16_t*)srow)[j]);
  switch (op) {
  case WSP_GGML_OP_POOL_AVG: *out += srow_j; break;
  case WSP_GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break;
@@ -6538,7 +7206,7 @@ void wsp_ggml_compute_forward_pool_2d_back(
  }

  const float val = dst->type == WSP_GGML_TYPE_F32 ?
- ((const float *) drowf)[j] : WSP_GGML_FP16_TO_FP32(((const wsp_ggml_fp16_t *) drowf)[j]);
+ ((const float *) drowf)[j] : WSP_GGML_CPU_FP16_TO_FP32(((const wsp_ggml_fp16_t *) drowf)[j]);
  if (val <= maxval) {
  continue;
  }
@@ -6558,7 +7226,7 @@ void wsp_ggml_compute_forward_pool_2d_back(
  if (dst->type == WSP_GGML_TYPE_F32) {
  ((float *) drow)[j] += grad0;
  } else {
- ((wsp_ggml_fp16_t *) drow)[j] = WSP_GGML_FP32_TO_FP16(grad0 + WSP_GGML_FP16_TO_FP32(((const wsp_ggml_fp16_t *) drow)[j]));
+ ((wsp_ggml_fp16_t *) drow)[j] = WSP_GGML_CPU_FP32_TO_FP16(grad0 + WSP_GGML_CPU_FP16_TO_FP32(((const wsp_ggml_fp16_t *) drow)[j]));
  }
  } else if (op == WSP_GGML_OP_POOL_AVG) {
  const float grad = grad0 / ka;
@@ -6577,7 +7245,7 @@ void wsp_ggml_compute_forward_pool_2d_back(
  if (dst->type == WSP_GGML_TYPE_F32) {
  ((float *) drow)[j] += grad;
  } else {
- ((wsp_ggml_fp16_t *) drow)[j] += WSP_GGML_FP32_TO_FP16(grad);
+ ((wsp_ggml_fp16_t *) drow)[j] += WSP_GGML_CPU_FP32_TO_FP16(grad);
  }
  }
  }
@@ -6608,12 +7276,13 @@ static void wsp_ggml_compute_forward_upscale_f32(

  WSP_GGML_TENSOR_UNARY_OP_LOCALS

- const float sf0 = (float)ne0/src0->ne[0];
- const float sf1 = (float)ne1/src0->ne[1];
- const float sf2 = (float)ne2/src0->ne[2];
- const float sf3 = (float)ne3/src0->ne[3];
+ float sf0 = (float)ne0/src0->ne[0];
+ float sf1 = (float)ne1/src0->ne[1];
+ float sf2 = (float)ne2/src0->ne[2];
+ float sf3 = (float)ne3/src0->ne[3];

- const wsp_ggml_scale_mode mode = (wsp_ggml_scale_mode) wsp_ggml_get_op_params_i32(dst, 0);
+ const int32_t mode_flags = wsp_ggml_get_op_params_i32(dst, 0);
+ const wsp_ggml_scale_mode mode = (wsp_ggml_scale_mode) (mode_flags & 0xFF);

  if (mode == WSP_GGML_SCALE_MODE_NEAREST) {
  for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -6634,8 +7303,12 @@ static void wsp_ggml_compute_forward_upscale_f32(
  }
  }
  } else if (mode == WSP_GGML_SCALE_MODE_BILINEAR) {
- // setting a pixel offset of 0 would replicate the behavior of pytorch interpolate with align_corners=True
- const float pixel_offset = 0.5f;
+ float pixel_offset = 0.5f;
+ if (mode_flags & WSP_GGML_SCALE_FLAG_ALIGN_CORNERS) {
+ pixel_offset = 0.0f;
+ sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1);
+ sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1);
+ }

  for (int64_t i3 = 0; i3 < ne3; i3++) {
  const int64_t i03 = i3 / sf3;
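Note: the upscale hunks above start packing flag bits alongside the scale mode in op param 0 (the low byte keeps the mode). With WSP_GGML_SCALE_FLAG_ALIGN_CORNERS set, bilinear sampling drops the half-pixel offset and rescales sf0/sf1 to (ne - 1)/(src_ne - 1), so the first and last output samples land exactly on the corner pixels, matching PyTorch interpolate with align_corners=True, as the removed comment noted. Per axis, the source coordinate for output index i becomes:

    #include <stdint.h>

    // Source coordinate for output index i along one axis (illustrative helper).
    static inline float src_coord(int64_t i, int64_t src_n, int64_t dst_n,
                                  int align_corners) {
        return align_corners
            ? (float) i * (float)(src_n - 1) / (float)(dst_n - 1)        // ends map to ends
            : ((float) i + 0.5f) * (float) src_n / (float) dst_n - 0.5f; // half-pixel centers
    }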
@@ -7142,7 +7815,7 @@ static void wsp_ggml_compute_forward_flash_attn_ext_f16(
  // loop over n_kv and n_head_kv
  // ref: https://arxiv.org/pdf/2112.05682.pdf
  for (int64_t ic = 0; ic < nek1; ++ic) {
- const float mv = mp ? slope*WSP_GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+ const float mv = mp ? slope*WSP_GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
  if (mv == -INFINITY) {
  continue;
  }
@@ -7210,7 +7883,7 @@ static void wsp_ggml_compute_forward_flash_attn_ext_f16(

  if (v->type == WSP_GGML_TYPE_F16) {
  for (int64_t d = 0; d < DV; ++d) {
- VKQ32[d] = WSP_GGML_FP16_TO_FP32(VKQ16[d]);
+ VKQ32[d] = WSP_GGML_CPU_FP16_TO_FP32(VKQ16[d]);
  }
  }

@@ -7994,6 +8667,34 @@ void wsp_ggml_compute_forward_unary(
  }
  }

+ //wsp_ggml_compute_forward_glu
+
+ void wsp_ggml_compute_forward_glu(
+ const wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * dst) {
+
+ const wsp_ggml_glu_op op = wsp_ggml_get_glu_op(dst);
+
+ switch (op) {
+ case WSP_GGML_GLU_OP_REGLU:
+ {
+ wsp_ggml_compute_forward_reglu(params, dst);
+ } break;
+ case WSP_GGML_GLU_OP_GEGLU:
+ {
+ wsp_ggml_compute_forward_geglu(params, dst);
+ } break;
+ case WSP_GGML_GLU_OP_SWIGLU:
+ {
+ wsp_ggml_compute_forward_swiglu(params, dst);
+ } break;
+ default:
+ {
+ WSP_GGML_ABORT("fatal error");
+ }
+ }
+ }
+
  // wsp_ggml_compute_forward_get_rel_pos

  static void wsp_ggml_compute_forward_get_rel_pos_f16(