npm - whisper.rn - Versions diffs - 0.5.1 → 0.5.3 - Mend

whisper.rn 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85)

package/cpp/ggml-metal/ggml-metal-ops.cpp CHANGED

@@ -10,6 +10,8 @@
 #include <cassert>
 #include <algorithm>
+#include <limits>
+#include <cmath>
 static wsp_ggml_metal_buffer_id wsp_ggml_metal_get_buffer_id(const wsp_ggml_tensor * t) {
     if (!t) {
@@ -226,6 +228,10 @@ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
             WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, node->src[0], nb);
             WSP_GGML_TENSOR_LOCALS( int64_t, ne1, node->src[1], ne);
             WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, node->src[1], nb);
+            WSP_GGML_TENSOR_LOCALS( int64_t, ne2, node->src[2], ne);
+            WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, node->src[2], nb);
+            WSP_GGML_TENSOR_LOCALS( int64_t, ne3, node->src[3], ne);
+            WSP_GGML_TENSOR_LOCALS(uint64_t, nb3, node->src[3], nb);
             WSP_GGML_TENSOR_LOCALS( int64_t, ne,  node,         ne);
             WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  node,         nb);
@@ -237,6 +243,14 @@ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
                 WSP_GGML_LOG_DEBUG("%s: src1 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, wsp_ggml_type_name(node->src[1]->type), ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
                         wsp_ggml_is_contiguous(node->src[1]), node->src[1]->name);
             }
+            if (node->src[2]) {
+                WSP_GGML_LOG_DEBUG("%s: src2 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, wsp_ggml_type_name(node->src[2]->type), ne20, ne21, ne22, ne23, nb20, nb21, nb22, nb23,
+                        wsp_ggml_is_contiguous(node->src[2]), node->src[2]->name);
+            }
+            if (node->src[3]) {
+                WSP_GGML_LOG_DEBUG("%s: src3 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, wsp_ggml_type_name(node->src[3]->type), ne30, ne31, ne32, ne33, nb30, nb31, nb32, nb33,
+                        wsp_ggml_is_contiguous(node->src[3]), node->src[3]->name);
+            }
             if (node) {
                 WSP_GGML_LOG_DEBUG("%s: node  - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], 1, %s\n", __func__, wsp_ggml_type_name(node->type), ne0, ne1, ne2, ne3, nb0, nb1, nb2, nb3,
                         node->name);
@@ -289,11 +303,19 @@ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = wsp_ggml_metal_op_glu(ctx, idx);
             } break;
+        case WSP_GGML_OP_SUM:
+            {
+                n_fuse = wsp_ggml_metal_op_sum(ctx, idx);
+            } break;
         case WSP_GGML_OP_SUM_ROWS:
         case WSP_GGML_OP_MEAN:
             {
                 n_fuse = wsp_ggml_metal_op_sum_rows(ctx, idx);
             } break;
+        case WSP_GGML_OP_CUMSUM:
+            {
+                n_fuse = wsp_ggml_metal_op_cumsum(ctx, idx);
+            } break;
         case WSP_GGML_OP_SOFT_MAX:
             {
                 n_fuse = wsp_ggml_metal_op_soft_max(ctx, idx);
@@ -348,10 +370,18 @@ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = wsp_ggml_metal_op_im2col(ctx, idx);
             } break;
+        case WSP_GGML_OP_CONV_2D:
+            {
+                n_fuse = wsp_ggml_metal_op_conv_2d(ctx, idx);
+            } break;
         case WSP_GGML_OP_CONV_TRANSPOSE_1D:
             {
                 n_fuse = wsp_ggml_metal_op_conv_transpose_1d(ctx, idx);
             } break;
+        case WSP_GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                n_fuse = wsp_ggml_metal_op_conv_transpose_2d(ctx, idx);
+            } break;
         case WSP_GGML_OP_UPSCALE:
             {
                 n_fuse = wsp_ggml_metal_op_upscale(ctx, idx);
@@ -398,6 +428,14 @@ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = wsp_ggml_metal_op_argmax(ctx, idx);
             } break;
+        case WSP_GGML_OP_OPT_STEP_ADAMW:
+            {
+                n_fuse = wsp_ggml_metal_op_opt_step_adamw(ctx, idx);
+            } break;
+        case WSP_GGML_OP_OPT_STEP_SGD:
+            {
+                n_fuse = wsp_ggml_metal_op_opt_step_sgd(ctx, idx);
+            } break;
        default:
             {
                 WSP_GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, wsp_ggml_op_name(node->op));
@@ -506,7 +544,7 @@ int wsp_ggml_metal_op_repeat(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_repeat(lib, op->type);
@@ -552,7 +590,7 @@ int wsp_ggml_metal_op_acc(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     WSP_GGML_ASSERT(op->src[0]->type == WSP_GGML_TYPE_F32);
     WSP_GGML_ASSERT(op->src[1]->type == WSP_GGML_TYPE_F32);
@@ -577,6 +615,7 @@ int wsp_ggml_metal_op_acc(wsp_ggml_metal_op_t ctx, int idx) {
         wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
         wsp_ggml_metal_kargs_cpy args = {
+            /*.nk0  =*/ ne00,
             /*.ne00 =*/ ne00,
             /*.ne01 =*/ ne01,
             /*.ne02 =*/ ne02,
@@ -660,7 +699,7 @@ int wsp_ggml_metal_op_scale(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     float scale;
     float bias;
@@ -699,7 +738,7 @@ int wsp_ggml_metal_op_clamp(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     float min;
     float max;
@@ -738,7 +777,7 @@ int wsp_ggml_metal_op_unary(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     int64_t n = wsp_ggml_nelements(op);
@@ -768,7 +807,7 @@ int wsp_ggml_metal_op_glu(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     if (op->src[1]) {
         WSP_GGML_ASSERT(wsp_ggml_are_same_shape(op->src[0], op->src[1]));
@@ -800,18 +839,6 @@ int wsp_ggml_metal_op_glu(wsp_ggml_metal_op_t ctx, int idx) {
     const int32_t nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00/2);
-    //[encoder setComputePipelineState:pipeline];
-    //[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-    //if (src1) {
-    //    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-    //} else {
-    //    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-    //}
-    //[encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-    //[encoder setBytes:&args length:sizeof(args) atIndex:3];
-    //[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
@@ -827,6 +854,43 @@ int wsp_ggml_metal_op_glu(wsp_ggml_metal_op_t ctx, int idx) {
     return 1;
 }
+int wsp_ggml_metal_op_sum(wsp_ggml_metal_op_t ctx, int idx) {
+    wsp_ggml_tensor * op  = ctx->node(idx);
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    const uint64_t n = (uint64_t) wsp_ggml_nelements(op->src[0]);
+    wsp_ggml_metal_kargs_sum args = {
+        /*.np =*/ n,
+    };
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_sum(lib, op);
+    int nth = 32; // SIMD width
+    while (nth < (int) n && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        nth *= 2;
+    }
+    nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    nth = std::min(nth, (int) n);
+    const int nsg = (nth + 31) / 32;
+    wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+    wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         2);
+    wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, nsg * sizeof(float), 0);
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, nth, 1, 1);
+    return 1;
+}
 int wsp_ggml_metal_op_sum_rows(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_tensor * op = ctx->node(idx);
@@ -836,7 +900,7 @@ int wsp_ggml_metal_op_sum_rows(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     wsp_ggml_metal_kargs_sum_rows args = {
         /*.ne00 =*/ ne00,
@@ -870,14 +934,6 @@ int wsp_ggml_metal_op_sum_rows(wsp_ggml_metal_op_t ctx, int idx) {
     const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
-    //[encoder setComputePipelineState:pipeline];
-    //[encoder setBytes:&args length:sizeof(args) atIndex:0];
-    //[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-    //[encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-    //[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-    //[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
@@ -890,6 +946,149 @@ int wsp_ggml_metal_op_sum_rows(wsp_ggml_metal_op_t ctx, int idx) {
     return 1;
 }
+int wsp_ggml_metal_op_cumsum(wsp_ggml_metal_op_t ctx, int idx) {
+    wsp_ggml_tensor * op = ctx->node(idx);
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(op->src[0]));
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+    wsp_ggml_metal_pipeline_t pipeline_blk = wsp_ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
+    int nth = 1;
+    while (nth < ne00 && 2*nth <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_blk)) {
+        nth *= 2;
+    }
+    WSP_GGML_ASSERT(ne00 <= nth*nth);
+    const int64_t net0 = (ne00 + nth - 1) / nth;
+    const int64_t net1 = ne01;
+    const int64_t net2 = ne02;
+    const int64_t net3 = ne03;
+    const uint64_t nbt0 = sizeof(float);
+    const uint64_t nbt1 = net0*nbt0;
+    const uint64_t nbt2 = net1*nbt1;
+    const uint64_t nbt3 = net2*nbt2;
+    const size_t smem = WSP_GGML_PAD(32*sizeof(float), 16);
+    wsp_ggml_metal_buffer_id bid_src0 = wsp_ggml_metal_get_buffer_id(op->src[0]);
+    wsp_ggml_metal_buffer_id bid_dst  = wsp_ggml_metal_get_buffer_id(op);
+    wsp_ggml_metal_buffer_id bid_tmp = bid_dst;
+    bid_tmp.offs += wsp_ggml_nbytes(op);
+    {
+        wsp_ggml_metal_kargs_cumsum_blk args = {
+            /*.ne00 =*/ ne00,
+            /*.ne01 =*/ ne01,
+            /*.ne02 =*/ ne02,
+            /*.ne03 =*/ ne03,
+            /*.nb00 =*/ nb00,
+            /*.nb01 =*/ nb01,
+            /*.nb02 =*/ nb02,
+            /*.nb03 =*/ nb03,
+            /*.net0 =*/ net0,
+            /*.net1 =*/ net1,
+            /*.net2 =*/ net2,
+            /*.net3 =*/ net3,
+            /*.nbt0 =*/ nbt0,
+            /*.nbt1 =*/ nbt1,
+            /*.nbt2 =*/ nbt2,
+            /*.nbt3 =*/ nbt3,
+            /*.outb =*/ ne00 > nth,
+        };
+        wsp_ggml_metal_encoder_set_pipeline(enc, pipeline_blk);
+        wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_tmp,  2);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst,  3);
+        wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+        wsp_ggml_metal_encoder_dispatch_threadgroups(enc, net0*ne01, ne02, ne03, nth, 1, 1);
+    }
+    if (ne00 > nth) {
+        wsp_ggml_metal_op_concurrency_reset(ctx);
+        {
+            wsp_ggml_metal_kargs_cumsum_blk args = {
+                /*.ne00 =*/ net0,
+                /*.ne01 =*/ net1,
+                /*.ne02 =*/ net2,
+                /*.ne03 =*/ net3,
+                /*.nb00 =*/ nbt0,
+                /*.nb01 =*/ nbt1,
+                /*.nb02 =*/ nbt2,
+                /*.nb03 =*/ nbt3,
+                /*.net0 =*/ net0,
+                /*.net1 =*/ net1,
+                /*.net2 =*/ net2,
+                /*.net3 =*/ net3,
+                /*.nbt0 =*/ nbt0,
+                /*.nbt1 =*/ nbt1,
+                /*.nbt2 =*/ nbt2,
+                /*.nbt3 =*/ nbt3,
+                /*.outb =*/ false,
+            };
+            wsp_ggml_metal_encoder_set_pipeline(enc, pipeline_blk);
+            wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_tmp, 1);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_tmp, 2);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_tmp, 3);
+            wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+            wsp_ggml_metal_encoder_dispatch_threadgroups(enc, net1, net2, net3, nth, 1, 1);
+        }
+        wsp_ggml_metal_op_concurrency_reset(ctx);
+        {
+            wsp_ggml_metal_pipeline_t pipeline_add = wsp_ggml_metal_library_get_pipeline_cumsum_add(lib, op);
+            wsp_ggml_metal_kargs_cumsum_add args = {
+                /*.ne00 =*/ ne00,
+                /*.ne01 =*/ ne01,
+                /*.ne02 =*/ ne02,
+                /*.ne03 =*/ ne03,
+                /*.nb00 =*/ nb00,
+                /*.nb01 =*/ nb01,
+                /*.nb02 =*/ nb02,
+                /*.nb03 =*/ nb03,
+                /*.net0 =*/ net0,
+                /*.net1 =*/ net1,
+                /*.net2 =*/ net2,
+                /*.net3 =*/ net3,
+                /*.nbt0 =*/ nbt0,
+                /*.nbt1 =*/ nbt1,
+                /*.nbt2 =*/ nbt2,
+                /*.nbt3 =*/ nbt3,
+            };
+            wsp_ggml_metal_encoder_set_pipeline(enc, pipeline_add);
+            wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_tmp, 1);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst, 2);
+            wsp_ggml_metal_encoder_dispatch_threadgroups(enc, net0*ne01, ne02, ne03, nth, 1, 1);
+        }
+    }
+    return 1;
+}
 int wsp_ggml_metal_op_get_rows(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_tensor * op = ctx->node(idx);
@@ -901,28 +1100,36 @@ int wsp_ggml_metal_op_get_rows(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
     wsp_ggml_metal_kargs_get_rows args = {
-        /*.ne00 =*/ ne00,
-        /*.nb01 =*/ nb01,
-        /*.nb02 =*/ nb02,
-        /*.ne10 =*/ ne10,
-        /*.nb10 =*/ nb10,
-        /*.nb11 =*/ nb11,
-        /*.nb1  =*/ nb1,
-        /*.nb2  =*/ nb2,
+        /*.ne00t =*/ wsp_ggml_is_quantized(op->src[0]->type) ? ne00/16 : ne00,
+        /*.ne00  =*/ ne00,
+        /*.nb01  =*/ nb01,
+        /*.nb02  =*/ nb02,
+        /*.nb03  =*/ nb03,
+        /*.ne10  =*/ ne10,
+        /*.nb10  =*/ nb10,
+        /*.nb11  =*/ nb11,
+        /*.nb12  =*/ nb12,
+        /*.nb1   =*/ nb1,
+        /*.nb2   =*/ nb2,
+        /*.nb3   =*/ nb3,
     };
+    const int nth = std::min(args.ne00t, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    const int nw0 = (args.ne00t + nth - 1)/nth;
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         3);
-    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne10, ne11, ne12, 32, 1, 1);
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, nw0*ne10, ne11, ne12, nth, 1, 1);
     return 1;
 }
@@ -938,7 +1145,7 @@ int wsp_ggml_metal_op_set_rows(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
@@ -1002,7 +1209,7 @@ int wsp_ggml_metal_op_soft_max(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     float scale;
     float max_bias;
@@ -1090,7 +1297,7 @@ int wsp_ggml_metal_op_ssm_conv(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     wsp_ggml_metal_kargs_ssm_conv args = {
         /*.ne00 =*/ ne00,
@@ -1117,7 +1324,7 @@ int wsp_ggml_metal_op_ssm_conv(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
     wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
     wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
-    wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op), 3);
+    wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op),         3);
     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
@@ -1145,7 +1352,7 @@ int wsp_ggml_metal_op_ssm_scan(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne6, op->src[6], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb6, op->src[6], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     const wsp_ggml_tensor * src3 = op->src[3];
     const wsp_ggml_tensor * src4 = op->src[4];
@@ -1172,25 +1379,36 @@ int wsp_ggml_metal_op_ssm_scan(wsp_ggml_metal_op_t ctx, int idx) {
         /*.n_seq_tokens =*/ n_seq_tokens,
         /*.n_seqs       =*/ n_seqs,
         /*.s_off        =*/ wsp_ggml_nelements(op->src[1]) * sizeof(float),
+        /*.nb00         =*/ nb00,
         /*.nb01         =*/ nb01,
         /*.nb02         =*/ nb02,
         /*.nb03         =*/ nb03,
+        /*.nb10         =*/ nb10,
         /*.nb11         =*/ nb11,
         /*.nb12         =*/ nb12,
+        /*.ns12         =*/ nb12/nb10,
         /*.nb13         =*/ nb13,
+        /*.nb20         =*/ nb20,
         /*.nb21         =*/ nb21,
+        /*.ns21         =*/ nb21/nb20,
         /*.nb22         =*/ nb22,
+        /*.ne30         =*/ ne30,
         /*.nb31         =*/ nb31,
         /*.nb41         =*/ nb41,
         /*.nb42         =*/ nb42,
+        /*.ns42         =*/ nb42/nb40,
         /*.nb43         =*/ nb43,
         /*.nb51         =*/ nb51,
         /*.nb52         =*/ nb52,
+        /*.ns52         =*/ nb52/nb50,
         /*.nb53         =*/ nb53,
+        /*.nb0          =*/ nb0,
     };
     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_ssm_scan(lib, op);
+    WSP_GGML_ASSERT(d_state <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
     const size_t sms = wsp_ggml_metal_pipeline_get_smem(pipeline);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
@@ -1206,13 +1424,7 @@ int wsp_ggml_metal_op_ssm_scan(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, sms, 0);
-    if (ne30 == 1) {
-        // Mamba-2
-        wsp_ggml_metal_encoder_dispatch_threadgroups(enc, d_inner, n_head, n_seqs, d_state, 1, 1);
-    } else {
-        WSP_GGML_ASSERT(d_inner == 1);
-        wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n_head, n_seqs, 1, d_state, 1, 1);
-    }
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, d_inner, n_head, n_seqs, d_state, 1, 1);
     return 1;
 }
@@ -1226,7 +1438,7 @@ int wsp_ggml_metal_op_rwkv(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     const int64_t B = op->op == WSP_GGML_OP_RWKV_WKV6 ? op->src[5]->ne[1] : op->src[6]->ne[1];
     const int64_t T = op->src[0]->ne[2];
@@ -1267,32 +1479,29 @@ int wsp_ggml_metal_op_cpy(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
     WSP_GGML_ASSERT(ne00 % wsp_ggml_blck_size(op->src[0]->type) == 0);
-    // TODO: support
-    //const int32_t nk00 = ne00/wsp_ggml_blck_size(op->type);
-    const int32_t nk00 = ne00;
-    int nth = 32; // SIMD width
-    while (nth < nk00 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
-        nth *= 2;
+    int64_t nk0 = ne00;
+    if (wsp_ggml_is_quantized(op->src[0]->type)) {
+        nk0 = ne00/16;
+    } else if (wsp_ggml_is_quantized(op->type)) {
+        nk0 = ne00/wsp_ggml_blck_size(op->type);
     }
-    nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    int nth = std::min<int>(nk0, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
     // when rows are small, we can batch them together in a single threadgroup
     int nrptg = 1;
     // TODO: relax this constraint in the future
     if (wsp_ggml_blck_size(op->src[0]->type) == 1 && wsp_ggml_blck_size(op->type) == 1) {
-        if (nth > nk00) {
-            nrptg = (nth + nk00 - 1)/nk00;
-            nth   = nk00;
+        if (nth > nk0) {
+            nrptg = (nth + nk0 - 1)/nk0;
+            nth   = nk0;
             if (nrptg*nth > wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
                 nrptg--;
@@ -1300,10 +1509,11 @@ int wsp_ggml_metal_op_cpy(wsp_ggml_metal_op_t ctx, int idx) {
         }
     }
-    nth = std::min(nth, nk00);
+    nth = std::min<int>(nth, nk0);
     wsp_ggml_metal_kargs_cpy args = {
-        /*.ne00 =*/ nk00,
+        /*.nk0  =*/ nk0,
+        /*.ne00 =*/ ne00,
         /*.ne01 =*/ ne01,
         /*.ne02 =*/ ne02,
         /*.ne03 =*/ ne03,
@@ -1321,12 +1531,14 @@ int wsp_ggml_metal_op_cpy(wsp_ggml_metal_op_t ctx, int idx) {
         /*.nb3  =*/ nb3,
     };
+    const int nw0 = nrptg == 1 ? (nk0 + nth - 1)/nth : 1;
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         2);
-    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, nrptg, 1);
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, nw0*(ne01 + nrptg - 1)/nrptg, ne02, ne03, nth, nrptg, 1);
     return 1;
 }
@@ -1340,7 +1552,7 @@ int wsp_ggml_metal_op_pool_2d(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     const int32_t * opts = op->op_params;
     wsp_ggml_op_pool op_pool = (wsp_ggml_op_pool) opts[0];
@@ -1404,7 +1616,7 @@ int wsp_ggml_metal_op_mul_mat(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     WSP_GGML_ASSERT(ne00 == ne10);
@@ -1520,9 +1732,8 @@ int wsp_ggml_metal_op_mul_mat(wsp_ggml_metal_op_t ctx, int idx) {
         !wsp_ggml_is_transposed(op->src[1]) &&
         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-        props_dev->has_simdgroup_mm && ne00 >= 64 &&
-        (ne11 > ne11_mm_min || (wsp_ggml_is_quantized(op->src[0]->type) && ne12 > 1))) {
-        //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+        props_dev->has_simdgroup_mm && ne00 >= 64 && ne11 > ne11_mm_min) {
+        //WSP_GGML_LOG_INFO("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
         // some Metal matrix data types require aligned pointers
         // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
@@ -1646,7 +1857,7 @@ int wsp_ggml_metal_op_mul_mat_id(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     // src2 = ids
     WSP_GGML_ASSERT(op->src[2]->type == WSP_GGML_TYPE_I32);
@@ -1875,20 +2086,114 @@ bool wsp_ggml_metal_op_flash_attn_ext_use_vec(const wsp_ggml_tensor * op) {
     return (ne01 < 20) && (ne00 % 32 == 0);
 }
+size_t wsp_ggml_metal_op_flash_attn_ext_extra_pad(const wsp_ggml_tensor * op) {
+    assert(op->op == WSP_GGML_OP_FLASH_ATTN_EXT);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
+    size_t res = 0;
+    const bool has_mask = op->src[3] != nullptr;
+    if (wsp_ggml_metal_op_flash_attn_ext_use_vec(op)) {
+        // note: always reserve the padding space to avoid graph reallocations
+        //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_VEC_NCPSG != 0;
+        const bool has_kvpad = true;
+        if (has_kvpad) {
+            res += OP_FLASH_ATTN_EXT_VEC_NCPSG*(
+                nb11*ne12*ne13 +
+                nb21*ne22*ne23 +
+                (has_mask ? wsp_ggml_type_size(WSP_GGML_TYPE_F16)*ne31*ne32*ne33 : 0));
+        }
+    } else {
+        //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_NCPSG != 0;
+        const bool has_kvpad = true;
+        if (has_kvpad) {
+            res += OP_FLASH_ATTN_EXT_NCPSG*(
+                nb11*ne12*ne13 +
+                nb21*ne22*ne23 +
+                (has_mask ? wsp_ggml_type_size(WSP_GGML_TYPE_F16)*ne31*ne32*ne33 : 0));
+        }
+    }
+    return res;
+}
+size_t wsp_ggml_metal_op_flash_attn_ext_extra_blk(const wsp_ggml_tensor * op) {
+    assert(op->op == WSP_GGML_OP_FLASH_ATTN_EXT);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+  //WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+  //WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+  //WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+  //WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+  //WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
+    size_t res = 0;
+    const bool has_mask = op->src[3] != nullptr;
+    if (!has_mask) {
+        return res;
+    }
+    const bool is_vec = wsp_ggml_metal_op_flash_attn_ext_use_vec(op);
+    // this optimization is not useful for the vector kernels
+    // note: always reserve the blk buffer to avoid graph reallocations
+    //if (is_vec) {
+    //    return res;
+    //}
+    const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPTG : OP_FLASH_ATTN_EXT_NQPTG;
+    const int ncpsg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NCPSG : OP_FLASH_ATTN_EXT_NCPSG;
+    const int64_t ne1 = (ne01 + nqptg - 1)/nqptg;
+    const int64_t ne0 = (ne30 + ncpsg - 1)/ncpsg;
+    res += WSP_GGML_PAD(wsp_ggml_type_size(WSP_GGML_TYPE_I8)*ne0*ne1*ne32*ne33, 32);
+    return res;
+}
 size_t wsp_ggml_metal_op_flash_attn_ext_extra_tmp(const wsp_ggml_tensor * op) {
     assert(op->op == WSP_GGML_OP_FLASH_ATTN_EXT);
-    const int64_t nwg = 32;
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+  //WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+  //WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+  //WSP_GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
+  //WSP_GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
+    size_t res = 0;
+    // note: always reserve the temp buffer to avoid graph reallocations
+    //if (wsp_ggml_metal_op_flash_attn_ext_use_vec(op)) {
+    if (true) {
+        const int64_t nwg = 32;
+        const int64_t ne01_max = std::min(ne01, 32);
-    const int64_t ne01 = op->src[0]->ne[1];
-    const int64_t ne02 = op->src[0]->ne[2];
-    const int64_t ne03 = op->src[0]->ne[3];
-    const int64_t ne20 = op->src[2]->ne[0];
+        // temp buffer for writing the results from each workgroup
+        // - ne20: the size of the Value head
+        // -  + 2: the S and M values for each intermediate result
+        res += wsp_ggml_type_size(WSP_GGML_TYPE_F32)*(ne01_max*ne02*ne03*nwg*(ne20 + 2));
+    }
-    // temp buffer for writing the results from each workgroup
-    // - ne20: the size of the Value head
-    // -  + 2: the S and M values for each intermediate result
-    return wsp_ggml_type_size(WSP_GGML_TYPE_F32)*(ne01*ne02*ne03*nwg*(ne20 + 2));
+    return res;
 }
 int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
@@ -1910,8 +2215,7 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     WSP_GGML_TENSOR_LOCALS( int32_t, nb,  op,         nb);
-    WSP_GGML_ASSERT(ne00 % 4  == 0);
-    WSP_GGML_ASSERT(ne11 % 32 == 0);
+    WSP_GGML_ASSERT(ne00 % 4 == 0);
     WSP_GGML_ASSERT(op->src[0]->type == WSP_GGML_TYPE_F32);
     WSP_GGML_ASSERT(op->src[1]->type == op->src[2]->type);
@@ -1921,8 +2225,8 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_ASSERT(ne12 == ne22);
     WSP_GGML_ASSERT(!op->src[3] || op->src[3]->type == WSP_GGML_TYPE_F16);
-    WSP_GGML_ASSERT(!op->src[3] || op->src[3]->ne[1] >= WSP_GGML_PAD(op->src[0]->ne[1], 8) &&
-            "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
+    WSP_GGML_ASSERT(!op->src[3] || op->src[3]->ne[1] >= op->src[0]->ne[1] &&
+            "the Flash-Attention Metal kernel requires the mask to be at least n_queries big");
     float scale;
     float max_bias;
@@ -1949,15 +2253,107 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_ASSERT(ne01 < 65536);
+    wsp_ggml_metal_buffer_id bid_src0 = wsp_ggml_metal_get_buffer_id(op->src[0]);
+    wsp_ggml_metal_buffer_id bid_src1 = wsp_ggml_metal_get_buffer_id(op->src[1]);
+    wsp_ggml_metal_buffer_id bid_src2 = wsp_ggml_metal_get_buffer_id(op->src[2]);
+    wsp_ggml_metal_buffer_id bid_src3 = has_mask  ? wsp_ggml_metal_get_buffer_id(op->src[3]) : bid_src0;
+    wsp_ggml_metal_buffer_id bid_src4 = has_sinks ? wsp_ggml_metal_get_buffer_id(op->src[4]) : bid_src0;
+    wsp_ggml_metal_buffer_id bid_dst = wsp_ggml_metal_get_buffer_id(op);
+    wsp_ggml_metal_buffer_id bid_pad = bid_dst;
+    bid_pad.offs += wsp_ggml_nbytes(op);
+    wsp_ggml_metal_buffer_id bid_blk = bid_pad;
+    bid_blk.offs += wsp_ggml_metal_op_flash_attn_ext_extra_pad(op);
+    wsp_ggml_metal_buffer_id bid_tmp = bid_blk;
+    bid_tmp.offs += wsp_ggml_metal_op_flash_attn_ext_extra_blk(op);
     if (!wsp_ggml_metal_op_flash_attn_ext_use_vec(op)) {
         // half8x8 kernel
-        const int64_t nqptg = 8;  // queries per threadgroup    !! sync with kernel template arguments !!
-        const int64_t ncpsg = 64; // cache values per simdgroup !! sync with kernel template arguments !!
+        const int nqptg = OP_FLASH_ATTN_EXT_NQPTG; // queries per threadgroup
+        const int ncpsg = OP_FLASH_ATTN_EXT_NCPSG; // cache values per simdgroup
         WSP_GGML_ASSERT(nqptg <= 32);
         WSP_GGML_ASSERT(nqptg  % 8  == 0);
         WSP_GGML_ASSERT(ncpsg  % 32 == 0);
+        bool need_sync = false;
+        const bool has_kvpad = ne11 % ncpsg != 0;
+        if (has_kvpad) {
+            assert(wsp_ggml_metal_op_flash_attn_ext_extra_pad(op) != 0);
+            wsp_ggml_metal_kargs_flash_attn_ext_pad args0 = {
+                /*.ne11    =*/ne11,
+                /*.ne_12_2 =*/ne12,
+                /*.ne_12_3 =*/ne13,
+                /*.nb11    =*/nb11,
+                /*.nb12    =*/nb12,
+                /*.nb13    =*/nb13,
+                /*.nb21    =*/nb21,
+                /*.nb22    =*/nb22,
+                /*.nb23    =*/nb23,
+                /*.ne31    =*/ne31,
+                /*.ne32    =*/ne32,
+                /*.ne33    =*/ne33,
+                /*.nb31    =*/nb31,
+                /*.nb32    =*/nb32,
+                /*.nb33    =*/nb33,
+            };
+            wsp_ggml_metal_pipeline_t pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
+            wsp_ggml_metal_encoder_set_pipeline(enc, pipeline0);
+            wsp_ggml_metal_encoder_set_bytes   (enc, &args0, sizeof(args0), 0);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_src1, 1);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_src2, 2);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_src3, 3);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_pad,  4);
+            assert(ne12 == ne22);
+            assert(ne13 == ne23);
+            wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ncpsg, std::max(ne12, ne32), std::max(ne13, ne33), 32, 1, 1);
+            need_sync = true;
+        }
+        if (has_mask) {
+            assert(wsp_ggml_metal_op_flash_attn_ext_extra_blk(op) != 0);
+            wsp_ggml_metal_kargs_flash_attn_ext_blk args0 = {
+                /*.ne01 =*/ ne01,
+                /*.ne30 =*/ ne30,
+                /*.ne31 =*/ ne31,
+                /*.ne32 =*/ ne32,
+                /*.ne33 =*/ ne33,
+                /*.nb31 =*/ nb31,
+                /*.nb32 =*/ nb32,
+                /*.nb33 =*/ nb33,
+            };
+            wsp_ggml_metal_pipeline_t pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
+            wsp_ggml_metal_encoder_set_pipeline(enc, pipeline0);
+            wsp_ggml_metal_encoder_set_bytes   (enc, &args0, sizeof(args0), 0);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_src3, 1);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_blk,  2);
+            const int32_t nblk1 = ((ne01 + nqptg - 1)/nqptg);
+            const int32_t nblk0 = ((ne30 + ncpsg - 1)/ncpsg);
+            wsp_ggml_metal_encoder_dispatch_threadgroups(enc, nblk0, nblk1, ne32*ne33, 32, 1, 1);
+            need_sync = true;
+        }
+        if (need_sync) {
+            wsp_ggml_metal_op_concurrency_reset(ctx);
+        }
         const int is_q = wsp_ggml_is_quantized(op->src[1]->type) ? 1 : 0;
         // 2*(2*ncpsg)
@@ -2007,6 +2403,7 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
             /*.nb21          =*/ nb21,
             /*.nb22          =*/ nb22,
             /*.nb23          =*/ nb23,
+            /*.ne31          =*/ ne31,
             /*.ne32          =*/ ne32,
             /*.ne33          =*/ ne33,
             /*.nb31          =*/ nb31,
@@ -2023,24 +2420,18 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
             /*.logit_softcap =*/ logit_softcap,
         };
-        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, nsg);
+        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
         wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
         wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-        wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
-        wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
-        wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), 3);
-        if (op->src[3]) {
-            wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[3]), 4);
-        } else {
-            wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 4);
-        }
-        if (op->src[4]) {
-            wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[4]), 5);
-        } else {
-            wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 5);
-        }
-        wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         6);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src1, 2);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src2, 3);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src3, 4);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src4, 5);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_pad,  6);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_blk,  7);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst,  8);
         wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
@@ -2048,14 +2439,60 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
 #undef FATTN_SMEM
     } else {
         // half4x4 kernel
-        const int64_t nqptg = 1;  // queries per threadgroup    !! sync with kernel template arguments !!
-        const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
-        const int64_t nkpsg = 1*ncpsg;
+        const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPTG; // queries per threadgroup
+        const int ncpsg = OP_FLASH_ATTN_EXT_VEC_NCPSG; // cache values per simdgroup !! sync with kernel template arguments !!
+        const int nkpsg = 1*ncpsg;
         WSP_GGML_ASSERT(nqptg <= 32);
         WSP_GGML_ASSERT(nqptg  % 1  == 0);
         WSP_GGML_ASSERT(ncpsg  % 32 == 0);
+        bool need_sync = false;
+        const bool has_kvpad = ne11 % ncpsg != 0;
+        if (has_kvpad) {
+            assert(wsp_ggml_metal_op_flash_attn_ext_extra_pad(op) != 0);
+            wsp_ggml_metal_kargs_flash_attn_ext_pad args0 = {
+                /*.ne11    =*/ne11,
+                /*.ne_12_2 =*/ne12,
+                /*.ne_12_3 =*/ne13,
+                /*.nb11    =*/nb11,
+                /*.nb12    =*/nb12,
+                /*.nb13    =*/nb13,
+                /*.nb21    =*/nb21,
+                /*.nb22    =*/nb22,
+                /*.nb23    =*/nb23,
+                /*.ne31    =*/ne31,
+                /*.ne32    =*/ne32,
+                /*.ne33    =*/ne33,
+                /*.nb31    =*/nb31,
+                /*.nb32    =*/nb32,
+                /*.nb33    =*/nb33,
+            };
+            wsp_ggml_metal_pipeline_t pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
+            wsp_ggml_metal_encoder_set_pipeline(enc, pipeline0);
+            wsp_ggml_metal_encoder_set_bytes   (enc, &args0, sizeof(args0), 0);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_src1, 1);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_src2, 2);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_src3, 3);
+            wsp_ggml_metal_encoder_set_buffer  (enc, bid_pad,  4);
+            assert(ne12 == ne22);
+            assert(ne13 == ne23);
+            wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ncpsg, std::max(ne12, ne32), std::max(ne13, ne33), 32, 1, 1);
+            need_sync = true;
+        }
+        if (need_sync) {
+            wsp_ggml_metal_op_concurrency_reset(ctx);
+        }
         // ne00 + 2*ncpsg*(nsg)
         // for each query, we load it as f16 in shared memory (ne00)
         // and store the soft_max values and the mask
@@ -2120,6 +2557,7 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
             /*.nb21          =*/ nb21,
             /*.nb22          =*/ nb22,
             /*.nb23          =*/ nb23,
+            /*.ne31          =*/ ne31,
             /*.ne32          =*/ ne32,
             /*.ne33          =*/ ne33,
             /*.nb31          =*/ nb31,
@@ -2136,25 +2574,17 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
             /*.logit_softcap =*/ logit_softcap,
         };
-        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, nsg, nwg);
+        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
         WSP_GGML_ASSERT(nsg*32 <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
         wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
         wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-        wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
-        wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
-        wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), 3);
-        if (op->src[3]) {
-            wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[3]), 4);
-        } else {
-            wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 4);
-        }
-        if (op->src[4]) {
-            wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[4]), 5);
-        } else {
-            wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 5);
-        }
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src1, 2);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src2, 3);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src3, 4);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src4, 5);
         const size_t smem = FATTN_SMEM(nsg);
@@ -2162,23 +2592,25 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
         WSP_GGML_ASSERT(smem <= props_dev->max_theadgroup_memory_size);
         if (nwg == 1) {
+            assert(wsp_ggml_metal_op_flash_attn_ext_extra_tmp(op) == 0);
             // using 1 workgroup -> write the result directly into dst
-            wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op), 6);
+            wsp_ggml_metal_encoder_set_buffer(enc, bid_pad, 6);
+            wsp_ggml_metal_encoder_set_buffer(enc, bid_dst, 7);
             wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
             wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
         } else {
             // sanity checks
+            assert(wsp_ggml_metal_op_flash_attn_ext_extra_tmp(op) != 0);
             WSP_GGML_ASSERT(ne01*ne02*ne03 == ne1*ne2*ne3);
             WSP_GGML_ASSERT((uint64_t)ne1*ne2*ne3 <= (1u << 31));
-            wsp_ggml_metal_buffer_id bid_dst = wsp_ggml_metal_get_buffer_id(op);
             // write the results from each workgroup into a temp buffer
-            wsp_ggml_metal_buffer_id bid_tmp = bid_dst;
-            bid_tmp.offs += wsp_ggml_nbytes(op);
-            wsp_ggml_metal_encoder_set_buffer(enc, bid_tmp, 6);
+            wsp_ggml_metal_encoder_set_buffer(enc, bid_pad, 6);
+            wsp_ggml_metal_encoder_set_buffer(enc, bid_tmp, 7);
             wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
             wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
@@ -2385,7 +2817,7 @@ int wsp_ggml_metal_op_l2_norm(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     float eps;
     memcpy(&eps, op->op_params, sizeof(float));
@@ -2433,7 +2865,7 @@ int wsp_ggml_metal_op_group_norm(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     const int32_t ngrp = ((const int32_t *) op->op_params)[0];
@@ -2488,7 +2920,7 @@ int wsp_ggml_metal_op_norm(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     float eps;
     memcpy(&eps, op->op_params, sizeof(float));
@@ -2624,7 +3056,7 @@ int wsp_ggml_metal_op_rope(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     // make sure we have one or more position id(ne10) per token(ne02)
     WSP_GGML_ASSERT(ne10 % ne02 == 0);
@@ -2688,6 +3120,7 @@ int wsp_ggml_metal_op_rope(wsp_ggml_metal_op_t ctx, int idx) {
         /* sect_1      =*/ sect_1,
         /* sect_2      =*/ sect_2,
         /* sect_3      =*/ sect_3,
+        /* src2        =*/ op->src[2] != nullptr,
     };
     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_rope(lib, op);
@@ -2717,7 +3150,7 @@ int wsp_ggml_metal_op_im2col(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     const int32_t s0 = ((const int32_t *)(op->op_params))[0];
     const int32_t s1 = ((const int32_t *)(op->op_params))[1];
@@ -2778,6 +3211,84 @@ int wsp_ggml_metal_op_im2col(wsp_ggml_metal_op_t ctx, int idx) {
     return 1;
 }
+int wsp_ggml_metal_op_conv_2d(wsp_ggml_metal_op_t ctx, int idx) { // encodes one compute dispatch for the graph node at index idx using the conv_2d pipeline
+    wsp_ggml_tensor * op = ctx->node(idx);
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); // ne00..ne03: dims of src[0]
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); // nb00..nb03: byte strides of src[0]
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne); // ne10..ne13: dims of src[1]
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); // nb10..nb13: byte strides of src[1]
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne); // ne0..ne3: dims of the output
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb); // nb0..nb3: byte strides of the output
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(op->src[0]));
+    WSP_GGML_ASSERT(op->src[1]->type == WSP_GGML_TYPE_F32);
+    WSP_GGML_ASSERT(op->type == WSP_GGML_TYPE_F32);
+    WSP_GGML_ASSERT(op->src[0]->type == WSP_GGML_TYPE_F16 || op->src[0]->type == WSP_GGML_TYPE_F32);
+    const int32_t s0 = ((const int32_t *) op->op_params)[0]; // op_params[0..5] are read as s0, s1, p0, p1, d0, d1 - presumably stride, padding and dilation per axis; confirm against the op constructor
+    const int32_t s1 = ((const int32_t *) op->op_params)[1];
+    const int32_t p0 = ((const int32_t *) op->op_params)[2];
+    const int32_t p1 = ((const int32_t *) op->op_params)[3];
+    const int32_t d0 = ((const int32_t *) op->op_params)[4];
+    const int32_t d1 = ((const int32_t *) op->op_params)[5];
+    wsp_ggml_metal_kargs_conv_2d args = { // per the field labels: src[0] supplies KW/KH/IC/OC, src[1] supplies IW/IH, the output supplies OW/OH/N
+        /*.nb00 =*/ nb00,
+        /*.nb01 =*/ nb01,
+        /*.nb02 =*/ nb02,
+        /*.nb03 =*/ nb03,
+        /*.nb10 =*/ nb10,
+        /*.nb11 =*/ nb11,
+        /*.nb12 =*/ nb12,
+        /*.nb13 =*/ nb13,
+        /*.nb0  =*/ nb0,
+        /*.nb1  =*/ nb1,
+        /*.nb2  =*/ nb2,
+        /*.nb3  =*/ nb3,
+        /*.IW   =*/ ne10,
+        /*.IH   =*/ ne11,
+        /*.KW   =*/ ne00,
+        /*.KH   =*/ ne01,
+        /*.IC   =*/ ne02,
+        /*.OC   =*/ ne03,
+        /*.OW   =*/ ne0,
+        /*.OH   =*/ ne1,
+        /*.N    =*/ ne3,
+        /*.s0   =*/ s0,
+        /*.s1   =*/ s1,
+        /*.p0   =*/ p0,
+        /*.p1   =*/ p1,
+        /*.d0   =*/ d0,
+        /*.d1   =*/ d1,
+    };
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_conv_2d(lib, op);
+    int nth = wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline); // threads per threadgroup, clamped to [1, 256] by the next two lines
+    nth = std::min(nth, 256);
+    nth = std::max(nth, 1);
+    const uint64_t n_out = wsp_ggml_nelements(op); // total number of output elements
+    uint64_t tg = (n_out + nth - 1)/nth; // ceil(n_out / nth) threadgroups
+    tg = std::max<uint64_t>(tg, 1);
+    tg = std::min<uint64_t>(tg, (uint64_t) std::numeric_limits<int>::max()); // NOTE(review): when this clamp applies, fewer than n_out threads are launched - presumably the kernel strides over the remainder; confirm
+    wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+    wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0); // binding 0: kernel arguments
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1); // binding 1: src[0]
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2); // binding 2: src[1]
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         3); // binding 3: output
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, tg, 1, 1, nth, 1, 1); // 1-D grid: tg threadgroups of nth threads
+    return 1; // presumably the number of graph nodes consumed by this encode - confirm against the dispatcher
+}
 int wsp_ggml_metal_op_conv_transpose_1d(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_tensor * op = ctx->node(idx);
@@ -2789,7 +3300,7 @@ int wsp_ggml_metal_op_conv_transpose_1d(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     const int32_t s0 = ((const int32_t *)(op->op_params))[0];
@@ -2823,6 +3334,62 @@ int wsp_ggml_metal_op_conv_transpose_1d(wsp_ggml_metal_op_t ctx, int idx) {
     return 1;
 }
+int wsp_ggml_metal_op_conv_transpose_2d(wsp_ggml_metal_op_t ctx, int idx) { // encodes one compute dispatch for the graph node at index idx using the conv_transpose_2d pipeline
+    wsp_ggml_tensor * op = ctx->node(idx);
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); // of the locals declared by these six macros, only nb0, nb1 and nb2 (output byte strides) are used below
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+    const int32_t s0 = ((const int32_t *)(op->op_params))[0]; // only op_params[0] is read - presumably a single stride shared by both axes; confirm
+    const int32_t IC = op->src[1]->ne[2]; // IC, IH, IW are taken from src[1]
+    const int32_t IH = op->src[1]->ne[1];
+    const int32_t IW = op->src[1]->ne[0];
+    const int32_t KH = op->src[0]->ne[1]; // KH, KW are taken from src[0]
+    const int32_t KW = op->src[0]->ne[0];
+    const int32_t OW = op->ne[0]; // OW, OH, OC are taken from the output tensor
+    const int32_t OH = op->ne[1];
+    const int32_t OC = op->ne[2];
+    wsp_ggml_metal_kargs_conv_transpose_2d args = {
+        /*.IC  =*/ IC,
+        /*.IH  =*/ IH,
+        /*.IW  =*/ IW,
+        /*.KH  =*/ KH,
+        /*.KW  =*/ KW,
+        /*.OC  =*/ OC,
+        /*.s0  =*/ s0,
+        /*.nb0 =*/ nb0,
+        /*.nb1 =*/ nb1,
+        /*.nb2 =*/ nb2,
+    };
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);
+    wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+    wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0); // binding 0: kernel arguments
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1); // binding 1: src[0]
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2); // binding 2: src[1]
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         3); // binding 3: output
+    // Metal requires buffer size to be multiple of 16 bytes
+    const size_t smem = WSP_GGML_PAD(KW * KH * sizeof(float), 16); // one float per kernel tap, padded up to 16 bytes
+    wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, OW, OH, OC, KW, KH, 1); // OW x OH x OC threadgroups of KW x KH threads; NOTE(review): KW*KH is not checked against the pipeline's max threads per threadgroup here - confirm it is validated elsewhere
+    return 1; // presumably the number of graph nodes consumed by this encode - confirm against the dispatcher
+}
 int wsp_ggml_metal_op_upscale(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_tensor * op = ctx->node(idx);
@@ -2832,7 +3399,7 @@ int wsp_ggml_metal_op_upscale(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     const float sf0 = (float)ne0/op->src[0]->ne[0];
     const float sf1 = (float)ne1/op->src[0]->ne[1];
@@ -2885,7 +3452,7 @@ int wsp_ggml_metal_op_pad(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     wsp_ggml_metal_kargs_pad args = {
         /*.ne00 =*/ ne00,
@@ -2929,7 +3496,7 @@ int wsp_ggml_metal_op_pad_reflect_1d(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     wsp_ggml_metal_kargs_pad_reflect_1d args = {
         /*.ne00 =*/ ne00,
@@ -2973,7 +3540,7 @@ int wsp_ggml_metal_op_arange(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_metal_encoder_t enc = ctx->enc;
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     float start;
     float step;
@@ -2991,12 +3558,6 @@ int wsp_ggml_metal_op_arange(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_arange(lib, op);
-    //[encoder setComputePipelineState:pipeline];
-    //[encoder setBuffer:id_dst  offset:offs_dst  atIndex:0];
-    //[encoder setBytes:&args length:sizeof(args) atIndex:1];
-    //[encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 1);
@@ -3015,7 +3576,7 @@ int wsp_ggml_metal_op_timestep_embedding(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     const int dim        = op->op_params[0];
     const int max_period = op->op_params[1];
@@ -3049,7 +3610,7 @@ int wsp_ggml_metal_op_argmax(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     wsp_ggml_metal_kargs_argmax args = {
         /*.ne00 = */ ne00,
@@ -3085,38 +3646,93 @@ int wsp_ggml_metal_op_argsort(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_metal_library_t lib = ctx->lib;
     wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(op->src[0]));
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_argsort(lib, op);
     // bitonic sort requires the number of elements to be power of 2
-    int64_t ne00_padded = 1;
-    while (ne00_padded < ne00) {
-        ne00_padded *= 2;
+    int nth = 1; // grows to the smallest power of two >= ne00, but stops before exceeding the pipeline's max threads per threadgroup
+    while (nth < ne00 && 2*nth <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        nth *= 2;
     }
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_argsort(lib, op);
-    const int64_t nrows = wsp_ggml_nrows(op->src[0]);
+    const int npr = (ne00 + nth - 1)/nth; // ceil(ne00 / nth): number of nth-sized blocks per row
     // Metal kernels require the buffer size to be multiple of 16 bytes
     // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
-    const size_t smem = WSP_GGML_PAD(ne00_padded*sizeof(int32_t), 16);
+    const size_t smem = WSP_GGML_PAD(nth*sizeof(int32_t), 16);
+    wsp_ggml_metal_buffer_id bid_src0 = wsp_ggml_metal_get_buffer_id(op->src[0]);
+    wsp_ggml_metal_buffer_id bid_dst  = wsp_ggml_metal_get_buffer_id(op);
+    wsp_ggml_metal_buffer_id bid_tmp = bid_dst;
+    bid_tmp.offs += wsp_ggml_nbytes(op); // scratch region starts right after the output data in the same buffer - presumably reserved by the buffer-size logic elsewhere; confirm
+    if ((int) ceil(std::log(npr) / std::log(2)) % 2 == 1) { // the merge loop below runs ceil(log2(npr)) times and swaps dst/tmp each time, so an odd count needs a swap up front; NOTE(review): floating-point log is used - if the quotient rounds slightly above an integer for a power-of-two npr, ceil overcounts and the parity flips; an integer pass count would avoid this
+        std::swap(bid_dst, bid_tmp);
+    }
     wsp_ggml_metal_kargs_argsort args = {
-        /*.ncols =*/ ne00,
-        /*.ncols_pad =*/ ne00_padded
+        /*.ne00 =*/ ne00,
+        /*.ne01 =*/ ne01,
+        /*.ne02 =*/ ne02,
+        /*.ne03 =*/ ne03,
+        /*.nb00 =*/ nb00,
+        /*.nb01 =*/ nb01,
+        /*.nb02 =*/ nb02,
+        /*.nb03 =*/ nb03,
     };
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
-    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         2);
+    wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+    wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
     wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
-    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, 1, nrows, 1, ne00_padded, 1, 1);
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1); // first pass: npr blocks for each of the ne01 rows, nth threads per block
+    wsp_ggml_metal_pipeline_t pipeline_merge = wsp_ggml_metal_library_get_pipeline_argsort_merge(lib, op);
+    int len = nth; // current run length; doubled after every merge pass
+    while (len < ne00) {
+        wsp_ggml_metal_op_concurrency_reset(ctx); // presumably orders this pass after the previous dispatch, whose output it consumes - confirm
+        wsp_ggml_metal_kargs_argsort_merge args_merge = { // NOTE(review): uses designated initializers, unlike the /*.field =*/ comment convention of the other kargs initializers in this file
+            .ne00 = ne00,
+            .ne01 = ne01,
+            .ne02 = ne02,
+            .ne03 = ne03,
+            .nb00 = nb00,
+            .nb01 = nb01,
+            .nb02 = nb02,
+            .nb03 = nb03,
+            .len  = len,
+        };
+        // merges per row
+        const int nm = (ne00 + 2*len - 1) / (2*len);
+        const int nth = std::min(512, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_merge)); // NOTE(review): shadows the outer nth for the rest of this loop body
+        wsp_ggml_metal_encoder_set_pipeline(enc, pipeline_merge);
+        wsp_ggml_metal_encoder_set_bytes   (enc, &args_merge, sizeof(args_merge), 0);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst,  2); // presumably the pass input (result of the previous pass) - confirm against the kernel signature
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_tmp,  3); // presumably the pass output - confirm against the kernel signature
+        wsp_ggml_metal_encoder_dispatch_threadgroups(enc, nm*ne01, ne02, ne03, nth, 1, 1);
+        std::swap(bid_dst, bid_tmp); // ping-pong: the buffer just bound at index 3 is bound at index 2 in the next pass
+        len <<= 1;
+    }
     return 1;
 }
@@ -3130,7 +3746,7 @@ int wsp_ggml_metal_op_leaky_relu(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
-    WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
     float slope;
     memcpy(&slope, op->op_params, sizeof(float));
@@ -3156,3 +3772,73 @@ int wsp_ggml_metal_op_leaky_relu(wsp_ggml_metal_op_t ctx, int idx) {
     return 1;
 }
+int wsp_ggml_metal_op_opt_step_adamw(wsp_ggml_metal_op_t ctx, int idx) { // Encodes one opt_step_adamw kernel dispatch for graph node `idx` on ctx->enc; always returns 1.
+    wsp_ggml_tensor * op = ctx->node(idx); // node being encoded
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); // src[0] `ne` fields as locals ne00..ne03
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); // src[0] `nb` fields as locals nb00..nb03
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne); // op `ne` fields as locals ne0..ne3; ne0 caps nth below
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb); // op `nb` fields as locals nb0..nb3
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);
+    const int64_t np = wsp_ggml_nelements(op->src[0]); // element count of src[0] as reported by wsp_ggml_nelements
+    wsp_ggml_metal_kargs_opt_step_adamw args = {
+        /*.np =*/ np, // the only kernel argument initialised here
+    };
+    int ida = 0; // running argument index: args at 0, then src[0]..src[4] at 1..5
+    wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+    wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), ida++);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), ida++); // NOTE(review): src[0..4] are presumably params, grads, two moment buffers and hyper-parameters - confirm against the op definition
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), ida++);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), ida++);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[3]), ida++);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[4]), ida++);
+    const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0); // pipeline thread limit, capped at ne0
+    const int64_t n = (np + nth - 1) / nth; // ceil(np / nth); NOTE(review): nth would be 0 if ne0 were 0 - presumably empty nodes are filtered before this call, confirm
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, nth, 1, 1); // NOTE(review): assumes argument order is (groups x,y,z, threads x,y,z) - confirm against the encoder API
+    return 1;
+}
+int wsp_ggml_metal_op_opt_step_sgd(wsp_ggml_metal_op_t ctx, int idx) { // Encodes one opt_step_sgd kernel dispatch for graph node `idx` on ctx->enc; always returns 1.
+    wsp_ggml_tensor * op = ctx->node(idx); // node being encoded
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); // src[0] `ne` fields as locals ne00..ne03
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); // src[0] `nb` fields as locals nb00..nb03
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne); // op `ne` fields as locals ne0..ne3; ne0 caps nth below
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb); // op `nb` fields as locals nb0..nb3
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);
+    const int64_t np = wsp_ggml_nelements(op->src[0]); // element count of src[0] as reported by wsp_ggml_nelements
+    wsp_ggml_metal_kargs_opt_step_sgd args = {
+        /*.np =*/ np, // the only kernel argument initialised here
+    };
+    int ida = 0; // running argument index: args at 0, then src[0]..src[2] at 1..3
+    wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+    wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), ida++);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), ida++); // NOTE(review): src[0..2] are presumably params, grads and hyper-parameters - confirm against the op definition
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), ida++);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), ida++);
+    const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0); // pipeline thread limit, capped at ne0
+    const int64_t n = (np + nth - 1) / nth; // ceil(np / nth); NOTE(review): nth would be 0 if ne0 were 0 - presumably empty nodes are filtered before this call, confirm
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, nth, 1, 1); // NOTE(review): assumes argument order is (groups x,y,z, threads x,y,z) - confirm against the encoder API
+    return 1;
+}