npm - whisper.rn - Versions diffs - 0.4.0-rc.5 → 0.4.0-rc.6 - Mend

whisper.rn 0.4.0-rc.5 → 0.4.0-rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/android/src/main/java/com/rnwhisper/WhisperContext.java +7 -2
package/android/src/main/jni.cpp +3 -2
package/cpp/ggml-alloc.h +1 -1
package/cpp/ggml-metal-whisper.metal +1497 -169
package/cpp/ggml-metal.m +530 -53
package/cpp/ggml-quants.c +2 -2
package/cpp/ggml.c +264 -99
package/cpp/ggml.h +21 -7
package/cpp/rn-whisper.cpp +2 -0
package/cpp/rn-whisper.h +3 -2
package/ios/RNWhisperContext.mm +8 -5
package/lib/commonjs/index.js.map +1 -1
package/lib/commonjs/version.json +1 -1
package/lib/module/index.js.map +1 -1
package/lib/module/version.json +1 -1
package/lib/typescript/index.d.ts +5 -0
package/lib/typescript/index.d.ts.map +1 -1
package/package.json +1 -1
package/src/index.ts +5 -0
package/src/version.json +1 -1
package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -4
package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +0 -8
package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +0 -19

package/cpp/ggml-metal.m CHANGED

@@ -66,9 +66,11 @@ struct wsp_ggml_metal_context {
     WSP_GGML_METAL_DECL_KERNEL(div_row);
     WSP_GGML_METAL_DECL_KERNEL(scale);
     WSP_GGML_METAL_DECL_KERNEL(scale_4);
-    WSP_GGML_METAL_DECL_KERNEL(silu);
+    WSP_GGML_METAL_DECL_KERNEL(tanh);
     WSP_GGML_METAL_DECL_KERNEL(relu);
     WSP_GGML_METAL_DECL_KERNEL(gelu);
+    WSP_GGML_METAL_DECL_KERNEL(gelu_quick);
+    WSP_GGML_METAL_DECL_KERNEL(silu);
     WSP_GGML_METAL_DECL_KERNEL(soft_max);
     WSP_GGML_METAL_DECL_KERNEL(soft_max_4);
     WSP_GGML_METAL_DECL_KERNEL(diag_mask_inf);
@@ -86,6 +88,7 @@ struct wsp_ggml_metal_context {
     WSP_GGML_METAL_DECL_KERNEL(get_rows_q5_K);
     WSP_GGML_METAL_DECL_KERNEL(get_rows_q6_K);
     WSP_GGML_METAL_DECL_KERNEL(rms_norm);
+    WSP_GGML_METAL_DECL_KERNEL(group_norm);
     WSP_GGML_METAL_DECL_KERNEL(norm);
     WSP_GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
     WSP_GGML_METAL_DECL_KERNEL(mul_mv_f16_f16);
@@ -102,6 +105,21 @@ struct wsp_ggml_metal_context {
     WSP_GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
     WSP_GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
     WSP_GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
+    //WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
+    //WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_1row);
+    //WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_l4);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q4_0_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q4_1_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q5_0_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q5_1_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q8_0_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q2_K_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q3_K_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
+    WSP_GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
     WSP_GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
     WSP_GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
     WSP_GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
@@ -130,8 +148,11 @@ struct wsp_ggml_metal_context {
     WSP_GGML_METAL_DECL_KERNEL(rope_f16);
     WSP_GGML_METAL_DECL_KERNEL(alibi_f32);
     WSP_GGML_METAL_DECL_KERNEL(im2col_f16);
+    WSP_GGML_METAL_DECL_KERNEL(upscale_f32);
+    WSP_GGML_METAL_DECL_KERNEL(pad_f32);
     WSP_GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc);
     WSP_GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc);
+    WSP_GGML_METAL_DECL_KERNEL(leaky_relu_f32);
     WSP_GGML_METAL_DECL_KERNEL(cpy_f32_f16);
     WSP_GGML_METAL_DECL_KERNEL(cpy_f32_f32);
     WSP_GGML_METAL_DECL_KERNEL(cpy_f32_q8_0);
@@ -140,6 +161,7 @@ struct wsp_ggml_metal_context {
     //WSP_GGML_METAL_DECL_KERNEL(cpy_f32_q5_0);
     //WSP_GGML_METAL_DECL_KERNEL(cpy_f32_q5_1);
     WSP_GGML_METAL_DECL_KERNEL(cpy_f16_f16);
+    WSP_GGML_METAL_DECL_KERNEL(cpy_f16_f32);
     WSP_GGML_METAL_DECL_KERNEL(concat);
     WSP_GGML_METAL_DECL_KERNEL(sqr);
     WSP_GGML_METAL_DECL_KERNEL(sum_rows);
@@ -318,9 +340,11 @@ struct wsp_ggml_metal_context * wsp_ggml_metal_init(int n_cb) {
         WSP_GGML_METAL_ADD_KERNEL(div_row);
         WSP_GGML_METAL_ADD_KERNEL(scale);
         WSP_GGML_METAL_ADD_KERNEL(scale_4);
-        WSP_GGML_METAL_ADD_KERNEL(silu);
+        WSP_GGML_METAL_ADD_KERNEL(tanh);
         WSP_GGML_METAL_ADD_KERNEL(relu);
         WSP_GGML_METAL_ADD_KERNEL(gelu);
+        WSP_GGML_METAL_ADD_KERNEL(gelu_quick);
+        WSP_GGML_METAL_ADD_KERNEL(silu);
         WSP_GGML_METAL_ADD_KERNEL(soft_max);
         WSP_GGML_METAL_ADD_KERNEL(soft_max_4);
         WSP_GGML_METAL_ADD_KERNEL(diag_mask_inf);
@@ -338,6 +362,7 @@ struct wsp_ggml_metal_context * wsp_ggml_metal_init(int n_cb) {
         WSP_GGML_METAL_ADD_KERNEL(get_rows_q5_K);
         WSP_GGML_METAL_ADD_KERNEL(get_rows_q6_K);
         WSP_GGML_METAL_ADD_KERNEL(rms_norm);
+        WSP_GGML_METAL_ADD_KERNEL(group_norm);
         WSP_GGML_METAL_ADD_KERNEL(norm);
         WSP_GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
         WSP_GGML_METAL_ADD_KERNEL(mul_mv_f16_f16);
@@ -354,6 +379,21 @@ struct wsp_ggml_metal_context * wsp_ggml_metal_init(int n_cb) {
         WSP_GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
         WSP_GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
         WSP_GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
+        //WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
+        //WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_1row);
+        //WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_l4);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q4_0_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q4_1_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q5_0_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q5_1_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q8_0_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q2_K_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q3_K_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
+        WSP_GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
         if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
             WSP_GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
             WSP_GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
@@ -384,8 +424,11 @@ struct wsp_ggml_metal_context * wsp_ggml_metal_init(int n_cb) {
         WSP_GGML_METAL_ADD_KERNEL(rope_f16);
         WSP_GGML_METAL_ADD_KERNEL(alibi_f32);
         WSP_GGML_METAL_ADD_KERNEL(im2col_f16);
+        WSP_GGML_METAL_ADD_KERNEL(upscale_f32);
+        WSP_GGML_METAL_ADD_KERNEL(pad_f32);
         WSP_GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc);
         WSP_GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc);
+        WSP_GGML_METAL_ADD_KERNEL(leaky_relu_f32);
         WSP_GGML_METAL_ADD_KERNEL(cpy_f32_f16);
         WSP_GGML_METAL_ADD_KERNEL(cpy_f32_f32);
         WSP_GGML_METAL_ADD_KERNEL(cpy_f32_q8_0);
@@ -394,6 +437,7 @@ struct wsp_ggml_metal_context * wsp_ggml_metal_init(int n_cb) {
         //WSP_GGML_METAL_ADD_KERNEL(cpy_f32_q5_0);
         //WSP_GGML_METAL_ADD_KERNEL(cpy_f32_q5_1);
         WSP_GGML_METAL_ADD_KERNEL(cpy_f16_f16);
+        WSP_GGML_METAL_ADD_KERNEL(cpy_f16_f32);
         WSP_GGML_METAL_ADD_KERNEL(concat);
         WSP_GGML_METAL_ADD_KERNEL(sqr);
         WSP_GGML_METAL_ADD_KERNEL(sum_rows);
@@ -416,9 +460,11 @@ void wsp_ggml_metal_free(struct wsp_ggml_metal_context * ctx) {
     WSP_GGML_METAL_DEL_KERNEL(div_row);
     WSP_GGML_METAL_DEL_KERNEL(scale);
     WSP_GGML_METAL_DEL_KERNEL(scale_4);
-    WSP_GGML_METAL_DEL_KERNEL(silu);
+    WSP_GGML_METAL_DEL_KERNEL(tanh);
     WSP_GGML_METAL_DEL_KERNEL(relu);
     WSP_GGML_METAL_DEL_KERNEL(gelu);
+    WSP_GGML_METAL_DEL_KERNEL(gelu_quick);
+    WSP_GGML_METAL_DEL_KERNEL(silu);
     WSP_GGML_METAL_DEL_KERNEL(soft_max);
     WSP_GGML_METAL_DEL_KERNEL(soft_max_4);
     WSP_GGML_METAL_DEL_KERNEL(diag_mask_inf);
@@ -436,6 +482,7 @@ void wsp_ggml_metal_free(struct wsp_ggml_metal_context * ctx) {
     WSP_GGML_METAL_DEL_KERNEL(get_rows_q5_K);
     WSP_GGML_METAL_DEL_KERNEL(get_rows_q6_K);
     WSP_GGML_METAL_DEL_KERNEL(rms_norm);
+    WSP_GGML_METAL_DEL_KERNEL(group_norm);
     WSP_GGML_METAL_DEL_KERNEL(norm);
     WSP_GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
     WSP_GGML_METAL_DEL_KERNEL(mul_mv_f16_f16);
@@ -452,6 +499,21 @@ void wsp_ggml_metal_free(struct wsp_ggml_metal_context * ctx) {
     WSP_GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
     WSP_GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
     WSP_GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
+    //WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
+    //WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_1row);
+    //WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_l4);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q4_0_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q4_1_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q5_0_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q5_1_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q8_0_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q2_K_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q3_K_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
+    WSP_GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
     if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
         WSP_GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
         WSP_GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
@@ -482,8 +544,11 @@ void wsp_ggml_metal_free(struct wsp_ggml_metal_context * ctx) {
     WSP_GGML_METAL_DEL_KERNEL(rope_f16);
     WSP_GGML_METAL_DEL_KERNEL(alibi_f32);
     WSP_GGML_METAL_DEL_KERNEL(im2col_f16);
+    WSP_GGML_METAL_DEL_KERNEL(upscale_f32);
+    WSP_GGML_METAL_DEL_KERNEL(pad_f32);
     WSP_GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc);
     WSP_GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc);
+    WSP_GGML_METAL_DEL_KERNEL(leaky_relu_f32);
     WSP_GGML_METAL_DEL_KERNEL(cpy_f32_f16);
     WSP_GGML_METAL_DEL_KERNEL(cpy_f32_f32);
     WSP_GGML_METAL_DEL_KERNEL(cpy_f32_q8_0);
@@ -492,6 +557,7 @@ void wsp_ggml_metal_free(struct wsp_ggml_metal_context * ctx) {
     //WSP_GGML_METAL_DEL_KERNEL(cpy_f32_q5_0);
     //WSP_GGML_METAL_DEL_KERNEL(cpy_f32_q5_1);
     WSP_GGML_METAL_DEL_KERNEL(cpy_f16_f16);
+    WSP_GGML_METAL_DEL_KERNEL(cpy_f16_f32);
     WSP_GGML_METAL_DEL_KERNEL(concat);
     WSP_GGML_METAL_DEL_KERNEL(sqr);
     WSP_GGML_METAL_DEL_KERNEL(sum_rows);
@@ -783,9 +849,11 @@ static bool wsp_ggml_metal_supports_op(const struct wsp_ggml_tensor * op) {
     switch (op->op) {
         case WSP_GGML_OP_UNARY:
             switch (wsp_ggml_get_unary_op(op)) {
-                case WSP_GGML_UNARY_OP_SILU:
+                case WSP_GGML_UNARY_OP_TANH:
                 case WSP_GGML_UNARY_OP_RELU:
                 case WSP_GGML_UNARY_OP_GELU:
+                case WSP_GGML_UNARY_OP_GELU_QUICK:
+                case WSP_GGML_UNARY_OP_SILU:
                     return true;
                 default:
                     return false;
@@ -797,6 +865,7 @@ static bool wsp_ggml_metal_supports_op(const struct wsp_ggml_tensor * op) {
         case WSP_GGML_OP_PERMUTE:
         case WSP_GGML_OP_CONCAT:
         case WSP_GGML_OP_ADD:
+        case WSP_GGML_OP_ACC:
         case WSP_GGML_OP_MUL:
         case WSP_GGML_OP_DIV:
         case WSP_GGML_OP_SCALE:
@@ -804,21 +873,50 @@ static bool wsp_ggml_metal_supports_op(const struct wsp_ggml_tensor * op) {
         case WSP_GGML_OP_SUM_ROWS:
         case WSP_GGML_OP_SOFT_MAX:
         case WSP_GGML_OP_RMS_NORM:
+        case WSP_GGML_OP_GROUP_NORM:
         case WSP_GGML_OP_NORM:
         case WSP_GGML_OP_ALIBI:
         case WSP_GGML_OP_ROPE:
         case WSP_GGML_OP_IM2COL:
+        case WSP_GGML_OP_UPSCALE:
+        case WSP_GGML_OP_PAD:
         case WSP_GGML_OP_ARGSORT:
-        case WSP_GGML_OP_DUP:
-        case WSP_GGML_OP_CPY:
-        case WSP_GGML_OP_CONT:
+        case WSP_GGML_OP_LEAKY_RELU:
         case WSP_GGML_OP_MUL_MAT:
         case WSP_GGML_OP_MUL_MAT_ID:
             return true;
+        case WSP_GGML_OP_CPY:
+        case WSP_GGML_OP_DUP:
+        case WSP_GGML_OP_CONT:
+            {
+                switch (op->src[0]->type) {
+                    case WSP_GGML_TYPE_F32:
+                        switch (op->type) {
+                           case WSP_GGML_TYPE_F16:
+                           case WSP_GGML_TYPE_F32:
+                           case WSP_GGML_TYPE_Q8_0:
+                           case WSP_GGML_TYPE_Q4_0:
+                           case WSP_GGML_TYPE_Q4_1:
+                                return true;
+                           default:
+                                return false;
+                        }
+                    case WSP_GGML_TYPE_F16:
+                        switch (op->type) {
+                           case WSP_GGML_TYPE_F16:
+                           case WSP_GGML_TYPE_F32:
+                                return true;
+                           default:
+                                return false;
+                        }
+                    default:
+                        return false;
+                };
+            }
         case WSP_GGML_OP_DIAG_MASK_INF:
         case WSP_GGML_OP_GET_ROWS:
             {
-                return op->ne[0] % 4 == 0;
+                return op->ne[3] == 1;
             }
         default:
             return false;
@@ -894,7 +992,10 @@ void wsp_ggml_metal_graph_compute(
                         } break;
                 }
-                WSP_GGML_ASSERT(wsp_ggml_metal_supports_op(dst));
+                if (!wsp_ggml_metal_supports_op(dst)) {
+                    WSP_GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, wsp_ggml_op_desc(dst));
+                    WSP_GGML_ASSERT(!"unsupported op");
+                }
                 const int64_t  ne00 = src0 ? src0->ne[0] : 0;
                 const int64_t  ne01 = src0 ? src0->ne[1] : 0;
@@ -991,34 +1092,39 @@ void wsp_ggml_metal_graph_compute(
                     case WSP_GGML_OP_MUL:
                     case WSP_GGML_OP_DIV:
                         {
-                            WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0));
-                            WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src1));
+                            const size_t offs = 0;
                             bool bcast_row = false;
                             int64_t nb = ne00;
-                            if (wsp_ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {
+                            id<MTLComputePipelineState> pipeline = nil;
+                            if (wsp_ggml_nelements(src1) == ne10 && wsp_ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+                                WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0));
                                 // src1 is a row
                                 WSP_GGML_ASSERT(ne11 == 1);
                                 nb = ne00 / 4;
                                 switch (dst->op) {
-                                    case WSP_GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add_row]; break;
-                                    case WSP_GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul_row]; break;
-                                    case WSP_GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div_row]; break;
+                                    case WSP_GGML_OP_ADD: pipeline = ctx->pipeline_add_row; break;
+                                    case WSP_GGML_OP_MUL: pipeline = ctx->pipeline_mul_row; break;
+                                    case WSP_GGML_OP_DIV: pipeline = ctx->pipeline_div_row; break;
                                     default: WSP_GGML_ASSERT(false);
                                 }
                                 bcast_row = true;
                             } else {
                                 switch (dst->op) {
-                                    case WSP_GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add]; break;
-                                    case WSP_GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul]; break;
-                                    case WSP_GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div]; break;
+                                    case WSP_GGML_OP_ADD: pipeline = ctx->pipeline_add; break;
+                                    case WSP_GGML_OP_MUL: pipeline = ctx->pipeline_mul; break;
+                                    case WSP_GGML_OP_DIV: pipeline = ctx->pipeline_div; break;
                                     default: WSP_GGML_ASSERT(false);
                                 }
                             }
+                            [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@@ -1046,18 +1152,99 @@ void wsp_ggml_metal_graph_compute(
                             [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
                             [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
                             [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
+                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
+                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
                             if (bcast_row) {
                                 const int64_t n = wsp_ggml_nelements(dst)/4;
                                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                             } else {
-                                const int nth = MIN(1024, ne0);
+                                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
                                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                             }
                         } break;
+                    case WSP_GGML_OP_ACC:
+                        {
+                            WSP_GGML_ASSERT(src0t == WSP_GGML_TYPE_F32);
+                            WSP_GGML_ASSERT(src1t == WSP_GGML_TYPE_F32);
+                            WSP_GGML_ASSERT(dstt  == WSP_GGML_TYPE_F32);
+                            WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0));
+                            WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src1));
+                            const size_t pnb1 = ((int32_t *) dst->op_params)[0];
+                            const size_t pnb2 = ((int32_t *) dst->op_params)[1];
+                            const size_t pnb3 = ((int32_t *) dst->op_params)[2];
+                            const size_t offs = ((int32_t *) dst->op_params)[3];
+                            const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
+                            if (!inplace) {
+                                // run a separate kernel to cpy src->dst
+                                // not sure how to avoid this
+                                // TODO: make a simpler cpy_bytes kernel
+                                const int nth = MIN(1024, ne00);
+                                [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
+                                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                                [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                                [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                                [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                                [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                                [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                                [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                                [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                                [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                                [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                                [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                                [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                                [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                                [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                                [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                            }
+                            [encoder setComputePipelineState:ctx->pipeline_add];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
+                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
+                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
+                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
+                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
+                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
+                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
+                            const int nth = MIN(1024, ne0);
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
                     case WSP_GGML_OP_SCALE:
                         {
                             WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0));
@@ -1081,16 +1268,15 @@ void wsp_ggml_metal_graph_compute(
                         } break;
                     case WSP_GGML_OP_UNARY:
                         switch (wsp_ggml_get_unary_op(gf->nodes[i])) {
-                            case WSP_GGML_UNARY_OP_SILU:
+                            case WSP_GGML_UNARY_OP_TANH:
                                 {
-                                    [encoder setComputePipelineState:ctx->pipeline_silu];
+                                    [encoder setComputePipelineState:ctx->pipeline_tanh];
                                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                     [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                                     const int64_t n = wsp_ggml_nelements(dst);
-                                    WSP_GGML_ASSERT(n % 4 == 0);
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                 } break;
                             case WSP_GGML_UNARY_OP_RELU:
                                 {
@@ -1111,6 +1297,28 @@ void wsp_ggml_metal_graph_compute(
                                     const int64_t n = wsp_ggml_nelements(dst);
                                     WSP_GGML_ASSERT(n % 4 == 0);
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                } break;
+                            case WSP_GGML_UNARY_OP_GELU_QUICK:
+                                {
+                                    [encoder setComputePipelineState:ctx->pipeline_gelu_quick];
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                                    const int64_t n = wsp_ggml_nelements(dst);
+                                    WSP_GGML_ASSERT(n % 4 == 0);
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                } break;
+                            case WSP_GGML_UNARY_OP_SILU:
+                                {
+                                    [encoder setComputePipelineState:ctx->pipeline_silu];
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                                    const int64_t n = wsp_ggml_nelements(dst);
+                                    WSP_GGML_ASSERT(n % 4 == 0);
                                     [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                 } break;
                             default:
@@ -1185,6 +1393,8 @@ void wsp_ggml_metal_graph_compute(
                             [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
                             if (id_src1) {
                                 [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
+                            } else {
+                                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
                             }
                             [encoder setBuffer:id_dst  offset:offs_dst    atIndex:2];
                             [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
@@ -1436,7 +1646,7 @@ void wsp_ggml_metal_graph_compute(
                                 else if (src0t == WSP_GGML_TYPE_Q6_K) {
                                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                 } else {
-                                    int64_t ny = (ne11 + nrows - 1)/nrows;
+                                    const int64_t ny = (ne11 + nrows - 1)/nrows;
                                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                 }
                             }
@@ -1448,7 +1658,7 @@ void wsp_ggml_metal_graph_compute(
                             WSP_GGML_ASSERT(src0t == WSP_GGML_TYPE_I32);
-                            const int n_as = ne00;
+                            const int n_as = ((int32_t *) dst->op_params)[1];
                             // TODO: make this more general
                             WSP_GGML_ASSERT(n_as <= 8);
@@ -1480,14 +1690,22 @@ void wsp_ggml_metal_graph_compute(
                             // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                             // to the matrix-vector kernel
-                            int ne11_mm_min = 0;
+                            int ne11_mm_min = 1;
                             const int idx = ((int32_t *) dst->op_params)[0];
+                            // batch size
+                            WSP_GGML_ASSERT(ne01 == ne11);
+                            const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
                             // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                             // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                                ne11 > ne11_mm_min) {
+                            // !!!
+                            // TODO: for now, always use mat-vec kernels until we figure out how to improve the
+                            //       indirect matrix multiplication
+                            // !!!
+                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
                                 switch (src2->type) {
                                     case WSP_GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32];  break;
                                     case WSP_GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32];  break;
@@ -1506,19 +1724,22 @@ void wsp_ggml_metal_graph_compute(
                                 [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
                                 [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
                                 [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                                [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:3];
-                                [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:4];
-                                [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:5];
-                                [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:6];
-                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
-                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:8];
-                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:9];
-                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:10];
-                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:11];
-                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
-                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:13];
-                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:14];
-                                [encoder setBytes:&idx     length:sizeof(idx)  atIndex:15];
+                                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:3];
+                                [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
+                                [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:5];
+                                [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
+                                [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:7];
+                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
+                                [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:9];
+                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:10];
+                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
+                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
+                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
+                                [encoder setBytes:&_ne1    length:sizeof(_ne1) atIndex:14];
+                                [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
+                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
+                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
+                                [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
                                 // TODO: how to make this an array? read Metal docs
                                 for (int j = 0; j < n_as; ++j) {
                                     struct wsp_ggml_tensor * src_cur = dst->src[2 + j];
@@ -1526,11 +1747,157 @@ void wsp_ggml_metal_graph_compute(
                                     size_t offs_src_cur = 0;
                                     id<MTLBuffer> id_src_cur = wsp_ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
-                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:16 + j];
+                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
                                 }
                                 [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                                // TODO: processing one row at a time (ne11 -> 1) is not efficient
+                                [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                            } else {
+                                int nth0 = 32;
+                                int nth1 = 1;
+                                int nrows = 1;
+                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+                                // use custom matrix x vector kernel
+                                switch (src2t) {
+                                    case WSP_GGML_TYPE_F32:
+                                        {
+                                            WSP_GGML_ASSERT(src1t == WSP_GGML_TYPE_F32);
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f32_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_F16:
+                                        {
+                                            WSP_GGML_ASSERT(src1t == WSP_GGML_TYPE_F32);
+                                            nth0 = 32;
+                                            nth1 = 1;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f16_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q4_0:
+                                        {
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_0_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q4_1:
+                                        {
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_1_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q5_0:
+                                        {
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_0_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q5_1:
+                                        {
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_1_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q8_0:
+                                        {
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q8_0_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q2_K:
+                                        {
+                                            nth0 = 2;
+                                            nth1 = 32;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q2_K_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q3_K:
+                                        {
+                                            nth0 = 2;
+                                            nth1 = 32;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q3_K_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q4_K:
+                                        {
+                                            nth0 = 4; //1;
+                                            nth1 = 8; //32;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_K_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q5_K:
+                                        {
+                                            nth0 = 2;
+                                            nth1 = 32;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_K_f32];
+                                        } break;
+                                    case WSP_GGML_TYPE_Q6_K:
+                                        {
+                                            nth0 = 2;
+                                            nth1 = 32;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32];
+                                        } break;
+                                    default:
+                                        {
+                                            WSP_GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                                            WSP_GGML_ASSERT(false && "not implemented");
+                                        }
+                                };
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
+                                [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
+                                [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
+                                [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
+                                [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
+                                [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
+                                [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
+                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
+                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
+                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
+                                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
+                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
+                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
+                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
+                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
+                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
+                                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:19];
+                                [encoder setBytes:&r2   length:sizeof(r2)   atIndex:20];
+                                [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
+                                [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
+                                // TODO: how to make this an array? read Metal docs
+                                for (int j = 0; j < n_as; ++j) {
+                                    struct wsp_ggml_tensor * src_cur = dst->src[2 + j];
+                                    size_t offs_src_cur = 0;
+                                    id<MTLBuffer> id_src_cur = wsp_ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
+                                }
+                                if (src2t == WSP_GGML_TYPE_Q4_0 || src2t == WSP_GGML_TYPE_Q4_1 ||
+                                    src2t == WSP_GGML_TYPE_Q5_0 || src2t == WSP_GGML_TYPE_Q5_1 || src2t == WSP_GGML_TYPE_Q8_0 ||
+                                    src2t == WSP_GGML_TYPE_Q2_K) { // || src2t == WSP_GGML_TYPE_Q4_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src2t == WSP_GGML_TYPE_Q4_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src2t == WSP_GGML_TYPE_Q3_K) {
+#ifdef WSP_GGML_QKK_64
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#else
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#endif
+                                }
+                                else if (src2t == WSP_GGML_TYPE_Q5_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src2t == WSP_GGML_TYPE_Q6_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                } else {
+                                    const int64_t ny = (_ne1 + nrows - 1)/nrows;
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
                             }
                         } break;
                     case WSP_GGML_OP_GET_ROWS:
@@ -1551,16 +1918,19 @@ void wsp_ggml_metal_graph_compute(
                                 default: WSP_GGML_ASSERT(false && "not implemented");
                             }
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
                             [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
                             [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
-                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:5];
-                            const int64_t n = wsp_ggml_nelements(src1);
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
+                            [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
+                            [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
+                            [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
+                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
                         } break;
                     case WSP_GGML_OP_RMS_NORM:
                         {
@@ -1587,6 +1957,38 @@ void wsp_ggml_metal_graph_compute(
                             [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                         } break;
+                    case WSP_GGML_OP_GROUP_NORM:
+                        {
+                            WSP_GGML_ASSERT(ne00 % 4 == 0);
+                            //float eps;
+                            //memcpy(&eps, dst->op_params, sizeof(float));
+                            const float eps = 1e-6f; // TODO: temporarily hardcoded
+                            const int32_t n_groups = ((int32_t *) dst->op_params)[0];
+                            int nth = 32; // SIMD width
+                            //while (nth < ne00/4 && nth < 1024) {
+                            //    nth *= 2;
+                            //}
+                            [encoder setComputePipelineState:ctx->pipeline_group_norm];
+                            [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
+                            [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
+                            [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
+                            [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
+                            [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
+                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+                            [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
                     case WSP_GGML_OP_NORM:
                         {
                             float eps;
@@ -1756,6 +2158,65 @@ void wsp_ggml_metal_graph_compute(
                             [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
                         } break;
+                    case WSP_GGML_OP_UPSCALE:
+                        {
+                            WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F32);
+                            const int sf = dst->op_params[0];
+                            [encoder setComputePipelineState:ctx->pipeline_upscale_f32];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+                            [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
+                            const int nth = MIN(1024, ne0);
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
+                    case WSP_GGML_OP_PAD:
+                        {
+                            WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F32);
+                            [encoder setComputePipelineState:ctx->pipeline_pad_f32];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+                            const int nth = MIN(1024, ne0);
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
                     case WSP_GGML_OP_ARGSORT:
                         {
                             WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F32);
@@ -1777,6 +2238,22 @@ void wsp_ggml_metal_graph_compute(
                             [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
                         } break;
+                    case WSP_GGML_OP_LEAKY_RELU:
+                        {
+                            WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F32);
+                            float slope;
+                            memcpy(&slope, dst->op_params, sizeof(float));
+                            [encoder setComputePipelineState:ctx->pipeline_leaky_relu_f32];
+                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
+                            [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
+                            const int64_t n = wsp_ggml_nelements(dst);
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
                     case WSP_GGML_OP_DUP:
                     case WSP_GGML_OP_CPY:
                     case WSP_GGML_OP_CONT:
@@ -1805,7 +2282,7 @@ void wsp_ggml_metal_graph_compute(
                                     {
                                         switch (dstt) {
                                             case WSP_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
-                                            case WSP_GGML_TYPE_F32: WSP_GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
+                                            case WSP_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f32]; break;
                                             default: WSP_GGML_ASSERT(false && "not implemented");
                                         };
                                     } break;