npm - whisper.rn - Versions diffs - 0.5.4 → 0.5.5 - Mend

whisper.rn 0.5.4 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91) hide show

package/cpp/ggml-metal/ggml-metal-ops.cpp CHANGED Viewed

@@ -221,7 +221,7 @@ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
         }
         if (ctx->debug_graph > 0) {
-            WSP_GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, wsp_ggml_op_name(node->op), is_concurrent ? "(concurrent)" : "");
+            WSP_GGML_LOG_DEBUG("%s: node[%5d] - %-12s %-12s %s\n", __func__, idx, wsp_ggml_op_name(node->op), wsp_ggml_get_name(node), is_concurrent ? "(concurrent)" : "");
         }
         if (ctx->debug_graph > 1) {
             WSP_GGML_TENSOR_LOCALS( int64_t, ne0, node->src[0], ne);
@@ -286,6 +286,10 @@ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = wsp_ggml_metal_op_scale(ctx, idx);
             } break;
+        case WSP_GGML_OP_FILL:
+            {
+                n_fuse = wsp_ggml_metal_op_fill(ctx, idx);
+            } break;
         case WSP_GGML_OP_CLAMP:
             {
                 n_fuse = wsp_ggml_metal_op_clamp(ctx, idx);
@@ -406,10 +410,18 @@ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = wsp_ggml_metal_op_argsort(ctx, idx);
             } break;
+        case WSP_GGML_OP_TOP_K:
+            {
+                n_fuse = wsp_ggml_metal_op_top_k(ctx, idx);
+            } break;
         case WSP_GGML_OP_LEAKY_RELU:
             {
                 n_fuse = wsp_ggml_metal_op_leaky_relu(ctx, idx);
             } break;
+        case WSP_GGML_OP_TRI:
+            {
+                n_fuse = wsp_ggml_metal_op_tri(ctx, idx);
+            } break;
         case WSP_GGML_OP_FLASH_ATTN_EXT:
             {
                 n_fuse = wsp_ggml_metal_op_flash_attn_ext(ctx, idx);
@@ -436,7 +448,11 @@ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = wsp_ggml_metal_op_opt_step_sgd(ctx, idx);
             } break;
-       default:
+        case WSP_GGML_OP_COUNT_EQUAL:
+            {
+                n_fuse = wsp_ggml_metal_op_count_equal(ctx, idx);
+            } break;
+        default:
             {
                 WSP_GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, wsp_ggml_op_name(node->op));
                 WSP_GGML_ABORT("fatal error");
@@ -520,7 +536,7 @@ int wsp_ggml_metal_op_concat(wsp_ggml_metal_op_t ctx, int idx) {
         /*.dim  =*/ dim,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_base(lib, WSP_GGML_OP_CONCAT);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_base(lib, WSP_GGML_OP_CONCAT);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -546,7 +562,7 @@ int wsp_ggml_metal_op_repeat(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_repeat(lib, op->type);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_repeat(lib, op->type);
     wsp_ggml_metal_kargs_repeat args = {
         /*.ne00 =*/ ne00,
@@ -612,7 +628,7 @@ int wsp_ggml_metal_op_acc(wsp_ggml_metal_op_t ctx, int idx) {
         // TODO: make a simpler cpy_bytes kernel
         //const id<MTLComputePipelineState> pipeline = ctx->pipelines[WSP_GGML_METAL_PIPELINE_TYPE_CPY_F32_F32].obj;
-        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
         wsp_ggml_metal_kargs_cpy args = {
             /*.nk0  =*/ ne00,
@@ -675,7 +691,7 @@ int wsp_ggml_metal_op_acc(wsp_ggml_metal_op_t ctx, int idx) {
         /*.o1   =*/ { 0 },
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_bin(lib, WSP_GGML_OP_ADD, 1, false);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_bin(lib, WSP_GGML_OP_ADD, 1, false);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -717,7 +733,42 @@ int wsp_ggml_metal_op_scale(wsp_ggml_metal_op_t ctx, int idx) {
         n /= 4;
     }
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
+    wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+    wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         2);
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+    return 1;
+}
+int wsp_ggml_metal_op_fill(wsp_ggml_metal_op_t ctx, int idx) {
+    wsp_ggml_tensor * op = ctx->node(idx);
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+    const float val = wsp_ggml_get_op_params_f32(op, 0);
+    wsp_ggml_metal_kargs_fill args = {
+        /*.val =*/ val
+    };
+    int64_t n = wsp_ggml_nelements(op);
+    if (n % 4 == 0) {
+        n /= 4;
+    }
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -756,7 +807,7 @@ int wsp_ggml_metal_op_clamp(wsp_ggml_metal_op_t ctx, int idx) {
         n /= 4;
     }
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -785,7 +836,7 @@ int wsp_ggml_metal_op_unary(wsp_ggml_metal_op_t ctx, int idx) {
         n /= 4;
     }
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 0);
@@ -813,7 +864,7 @@ int wsp_ggml_metal_op_glu(wsp_ggml_metal_op_t ctx, int idx) {
         WSP_GGML_ASSERT(wsp_ggml_are_same_shape(op->src[0], op->src[1]));
     }
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_glu(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_glu(lib, op);
     const int32_t swp = wsp_ggml_get_op_params_i32(op, 1);
     const float alpha = wsp_ggml_get_op_params_f32(op, 2);
@@ -866,7 +917,7 @@ int wsp_ggml_metal_op_sum(wsp_ggml_metal_op_t ctx, int idx) {
         /*.np =*/ n,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_sum(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_sum(lib, op);
     int nth = 32; // SIMD width
@@ -921,7 +972,7 @@ int wsp_ggml_metal_op_sum_rows(wsp_ggml_metal_op_t ctx, int idx) {
         /*.nb3  =*/ nb3,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_sum_rows(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_sum_rows(lib, op);
     int nth = 32; // SIMD width
@@ -932,7 +983,7 @@ int wsp_ggml_metal_op_sum_rows(wsp_ggml_metal_op_t ctx, int idx) {
     nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
     nth = std::min(nth, ne00);
-    const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -959,7 +1010,7 @@ int wsp_ggml_metal_op_cumsum(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
-    wsp_ggml_metal_pipeline_t pipeline_blk = wsp_ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
+    auto pipeline_blk = wsp_ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
     int nth = 1;
     while (nth < ne00 && 2*nth <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_blk)) {
@@ -1056,7 +1107,7 @@ int wsp_ggml_metal_op_cumsum(wsp_ggml_metal_op_t ctx, int idx) {
         wsp_ggml_metal_op_concurrency_reset(ctx);
         {
-            wsp_ggml_metal_pipeline_t pipeline_add = wsp_ggml_metal_library_get_pipeline_cumsum_add(lib, op);
+            auto pipeline_add = wsp_ggml_metal_library_get_pipeline_cumsum_add(lib, op);
             wsp_ggml_metal_kargs_cumsum_add args = {
                 /*.ne00 =*/ ne00,
@@ -1102,7 +1153,7 @@ int wsp_ggml_metal_op_get_rows(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
     wsp_ggml_metal_kargs_get_rows args = {
         /*.ne00t =*/ wsp_ggml_is_quantized(op->src[0]->type) ? ne00/16 : ne00,
@@ -1147,7 +1198,7 @@ int wsp_ggml_metal_op_set_rows(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
     const int32_t nk0 = ne0/wsp_ggml_blck_size(op->type);
@@ -1248,7 +1299,7 @@ int wsp_ggml_metal_op_soft_max(wsp_ggml_metal_op_t ctx, int idx) {
         /*.n_head_log2 =*/ n_head_log2,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_soft_max(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_soft_max(lib, op);
     int nth = 32; // SIMD width
@@ -1262,7 +1313,7 @@ int wsp_ggml_metal_op_soft_max(wsp_ggml_metal_op_t ctx, int idx) {
         }
     }
-    const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
@@ -1318,15 +1369,43 @@ int wsp_ggml_metal_op_ssm_conv(wsp_ggml_metal_op_t ctx, int idx) {
         /*.nb2  =*/ nb2,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_ssm_conv(lib, op);
+    // Use batched kernel for prefill (ne1 > 1) to reduce threadgroup dispatch overhead
+    const bool use_batched = (ne1 > 1);
-    wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
-    wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
-    wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
-    wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
-    wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op),         3);
+    if (use_batched) {
+        // Determine the smallest power of 2 that's >= ne1, but <= 256
+        int BATCH_SIZE;
+        if      (ne1 > 128) BATCH_SIZE = 256;
+        else if (ne1 > 64 ) BATCH_SIZE = 128;
+        else if (ne1 > 32 ) BATCH_SIZE = 64;
+        else if (ne1 > 16 ) BATCH_SIZE = 32;
+        else if (ne1 > 8  ) BATCH_SIZE = 16;
+        else if (ne1 > 4  ) BATCH_SIZE = 8;
+        else                BATCH_SIZE = 2;
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_ssm_conv_batched(lib, op, BATCH_SIZE);
+        wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+        wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op),         3);
-    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
+        // Dispatch: ne01 rows, ceil(ne1/BATCH_SIZE) token batches, ne02 sequences
+        // Each threadgroup has BATCH_SIZE threads, each handling one token
+        const int n_token_batches = (ne1 + BATCH_SIZE - 1) / BATCH_SIZE;
+        wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, n_token_batches, ne02, BATCH_SIZE, 1, 1);
+    } else {
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_ssm_conv(lib, op);
+        wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+        wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op),         3);
+        wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
+    }
     return 1;
 }
@@ -1405,11 +1484,11 @@ int wsp_ggml_metal_op_ssm_scan(wsp_ggml_metal_op_t ctx, int idx) {
         /*.nb0          =*/ nb0,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_ssm_scan(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_ssm_scan(lib, op);
     WSP_GGML_ASSERT(d_state <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
-    const size_t sms = wsp_ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -1422,7 +1501,7 @@ int wsp_ggml_metal_op_ssm_scan(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[6]), 7);
     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         8);
-    wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, sms, 0);
+    wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, d_inner, n_head, n_seqs, d_state, 1, 1);
@@ -1445,7 +1524,7 @@ int wsp_ggml_metal_op_rwkv(wsp_ggml_metal_op_t ctx, int idx) {
     const int64_t C = op->ne[0];
     const int64_t H = op->src[0]->ne[1];
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_rwkv(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_rwkv(lib, op);
     int ida = 0;
@@ -1481,7 +1560,7 @@ int wsp_ggml_metal_op_cpy(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
     WSP_GGML_ASSERT(ne00 % wsp_ggml_blck_size(op->src[0]->type) == 0);
@@ -1588,7 +1667,7 @@ int wsp_ggml_metal_op_pool_2d(wsp_ggml_metal_op_t ctx, int idx) {
         /* .np = */ np
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
     const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), (int) np);
     const int ntg = (np + nth - 1) / nth;
@@ -1697,7 +1776,7 @@ int wsp_ggml_metal_op_mul_mat(wsp_ggml_metal_op_t ctx, int idx) {
                 WSP_GGML_ABORT("unsupported ne11");
         };
-        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
         wsp_ggml_metal_kargs_mul_mv_ext args = {
             /*.ne00  =*/ ne00,
@@ -1744,7 +1823,7 @@ int wsp_ggml_metal_op_mul_mat(wsp_ggml_metal_op_t ctx, int idx) {
         //    default: break;
         //}
-        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mm(lib, op);
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_mul_mm(lib, op);
         wsp_ggml_metal_kargs_mul_mm args = {
             /*.ne00 =*/ ne00,
@@ -1769,18 +1848,18 @@ int wsp_ggml_metal_op_mul_mat(wsp_ggml_metal_op_t ctx, int idx) {
         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         3);
-        const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+        const size_t smem = pipeline.smem;
         wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
         wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
     } else {
-        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mv(lib, op);
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_mul_mv(lib, op);
-        const int nr0 = wsp_ggml_metal_pipeline_get_nr0(pipeline);
-        const int nr1 = wsp_ggml_metal_pipeline_get_nr1(pipeline);
-        const int nsg = wsp_ggml_metal_pipeline_get_nsg(pipeline);
+        const int nr0 = pipeline.nr0;
+        const int nr1 = pipeline.nr1;
+        const int nsg = pipeline.nsg;
-        const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+        const size_t smem = pipeline.smem;
         wsp_ggml_metal_kargs_mul_mv args = {
             /*.ne00 =*/ ne00,
@@ -1911,9 +1990,9 @@ int wsp_ggml_metal_op_mul_mat_id(wsp_ggml_metal_op_t ctx, int idx) {
                 nb21,
             };
-            wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
+            auto pipeline = wsp_ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
-            const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+            const size_t smem = pipeline.smem;
             WSP_GGML_ASSERT(ne02 <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@@ -1934,7 +2013,7 @@ int wsp_ggml_metal_op_mul_mat_id(wsp_ggml_metal_op_t ctx, int idx) {
         wsp_ggml_metal_op_concurrency_reset(ctx);
         {
-            wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
+            auto pipeline = wsp_ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
             wsp_ggml_metal_kargs_mul_mm_id args = {
                 /*.ne00  =*/ ne00,
@@ -1963,20 +2042,20 @@ int wsp_ggml_metal_op_mul_mat_id(wsp_ggml_metal_op_t ctx, int idx) {
             wsp_ggml_metal_encoder_set_buffer  (enc, bid_ids,  4);
             wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst,  5);
-            const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+            const size_t smem = pipeline.smem;
             wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
             wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
         }
     } else {
-        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
-        const int nr0 = wsp_ggml_metal_pipeline_get_nr0(pipeline);
-        const int nr1 = wsp_ggml_metal_pipeline_get_nr1(pipeline);
-        const int nsg = wsp_ggml_metal_pipeline_get_nsg(pipeline);
+        const int nr0 = pipeline.nr0;
+        const int nr1 = pipeline.nr1;
+        const int nsg = pipeline.nsg;
-        const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+        const size_t smem = pipeline.smem;
         wsp_ggml_metal_kargs_mul_mv_id args = {
             /*.nei0 =*/ ne20,
@@ -2060,7 +2139,7 @@ int wsp_ggml_metal_op_add_id(wsp_ggml_metal_op_t ctx, int idx) {
         /*.nb21 =*/ nb21,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_base(lib, WSP_GGML_OP_ADD_ID);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_base(lib, WSP_GGML_OP_ADD_ID);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -2102,7 +2181,11 @@ size_t wsp_ggml_metal_op_flash_attn_ext_extra_pad(const wsp_ggml_tensor * op) {
     const bool has_mask = op->src[3] != nullptr;
-    if (wsp_ggml_metal_op_flash_attn_ext_use_vec(op)) {
+    // note: the non-vec kernel requires more extra memory, so always reserve for it
+    WSP_GGML_ASSERT(OP_FLASH_ATTN_EXT_NCPSG >= OP_FLASH_ATTN_EXT_VEC_NCPSG);
+    //if (wsp_ggml_metal_op_flash_attn_ext_use_vec(op)) {
+    if (false) {
         // note: always reserve the padding space to avoid graph reallocations
         //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_VEC_NCPSG != 0;
         const bool has_kvpad = true;
@@ -2304,7 +2387,7 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
                 /*.nb33    =*/nb33,
             };
-            wsp_ggml_metal_pipeline_t pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
+            auto pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
             wsp_ggml_metal_encoder_set_pipeline(enc, pipeline0);
             wsp_ggml_metal_encoder_set_bytes   (enc, &args0, sizeof(args0), 0);
@@ -2335,7 +2418,7 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
                 /*.nb33 =*/ nb33,
             };
-            wsp_ggml_metal_pipeline_t pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
+            auto pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
             wsp_ggml_metal_encoder_set_pipeline(enc, pipeline0);
             wsp_ggml_metal_encoder_set_bytes   (enc, &args0, sizeof(args0), 0);
@@ -2420,7 +2503,7 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
             /*.logit_softcap =*/ logit_softcap,
         };
-        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
         wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
         wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -2472,7 +2555,7 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
                 /*.nb33    =*/nb33,
             };
-            wsp_ggml_metal_pipeline_t pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
+            auto pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
             wsp_ggml_metal_encoder_set_pipeline(enc, pipeline0);
             wsp_ggml_metal_encoder_set_bytes   (enc, &args0, sizeof(args0), 0);
@@ -2574,7 +2657,7 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
             /*.logit_softcap =*/ logit_softcap,
         };
-        wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
         WSP_GGML_ASSERT(nsg*32 <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@@ -2626,7 +2709,7 @@ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
                     nrows,
                 };
-                wsp_ggml_metal_pipeline_t pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
+                auto pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
                 wsp_ggml_metal_encoder_set_pipeline(enc, pipeline0);
                 wsp_ggml_metal_encoder_set_bytes   (enc, &args0, sizeof(args0), 0);
@@ -2758,7 +2841,7 @@ int wsp_ggml_metal_op_bin(wsp_ggml_metal_op_t ctx, int idx) {
     // the offsets of src1 and all fused buffers are relative to the start of the src1 buffer
     bid_src1.offs = 0;
-    wsp_ggml_metal_pipeline_t pipeline = nullptr;
+    struct wsp_ggml_metal_pipeline_with_params pipeline;
     if (wsp_ggml_nelements(op->src[1]) == ne10 && wsp_ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) {
         WSP_GGML_ASSERT(wsp_ggml_is_contiguous(op->src[0]));
@@ -2831,7 +2914,7 @@ int wsp_ggml_metal_op_l2_norm(wsp_ggml_metal_op_t ctx, int idx) {
         /*.eps    =*/ eps,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_l2_norm(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_l2_norm(lib, op);
     while (nth < ne00/4 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
         nth *= 2;
@@ -2840,7 +2923,7 @@ int wsp_ggml_metal_op_l2_norm(wsp_ggml_metal_op_t ctx, int idx) {
     nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
     nth = std::min(nth, ne00/4);
-    const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
     const int64_t nrows = wsp_ggml_nrows(op->src[0]);
@@ -2883,7 +2966,7 @@ int wsp_ggml_metal_op_group_norm(wsp_ggml_metal_op_t ctx, int idx) {
         /*.eps  =*/ eps,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_group_norm(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_group_norm(lib, op);
     int nth = 32; // SIMD width
     //while (nth < ne00/4 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
@@ -2893,7 +2976,7 @@ int wsp_ggml_metal_op_group_norm(wsp_ggml_metal_op_t ctx, int idx) {
     //nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
     //nth = std::min(nth, ne00/4);
-    const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -3018,7 +3101,7 @@ int wsp_ggml_metal_op_norm(wsp_ggml_metal_op_t ctx, int idx) {
         }
     }
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
     int nth = 32; // SIMD width
@@ -3029,7 +3112,7 @@ int wsp_ggml_metal_op_norm(wsp_ggml_metal_op_t ctx, int idx) {
     nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
     nth = std::min(nth, args.ne00_t);
-    const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -3123,7 +3206,7 @@ int wsp_ggml_metal_op_rope(wsp_ggml_metal_op_t ctx, int idx) {
         /* src2        =*/ op->src[2] != nullptr,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_rope(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_rope(lib, op);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -3195,7 +3278,7 @@ int wsp_ggml_metal_op_im2col(wsp_ggml_metal_op_t ctx, int idx) {
         /*.KHW  =*/ KH * KW,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_im2col(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_im2col(lib, op);
     WSP_GGML_ASSERT(KH*KW <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@@ -3266,7 +3349,7 @@ int wsp_ggml_metal_op_conv_2d(wsp_ggml_metal_op_t ctx, int idx) {
         /*.d1   =*/ d1,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_conv_2d(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_conv_2d(lib, op);
     int nth = wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
     nth = std::min(nth, 256);
@@ -3321,7 +3404,7 @@ int wsp_ggml_metal_op_conv_transpose_1d(wsp_ggml_metal_op_t ctx, int idx) {
         /*.nb1 =*/ nb1,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -3373,7 +3456,7 @@ int wsp_ggml_metal_op_conv_transpose_2d(wsp_ggml_metal_op_t ctx, int idx) {
         /*.nb2 =*/ nb2,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -3429,7 +3512,7 @@ int wsp_ggml_metal_op_upscale(wsp_ggml_metal_op_t ctx, int idx) {
         /*.sf3 =*/ sf3
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_upscale(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_upscale(lib, op);
     const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
@@ -3473,7 +3556,7 @@ int wsp_ggml_metal_op_pad(wsp_ggml_metal_op_t ctx, int idx) {
         /*.nb3  =*/ nb3
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_pad(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_pad(lib, op);
     const int nth = std::min(1024, ne0);
@@ -3519,7 +3602,7 @@ int wsp_ggml_metal_op_pad_reflect_1d(wsp_ggml_metal_op_t ctx, int idx) {
         /*.p1 =*/ ((const int32_t *)(op->op_params))[1]
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
     const int nth = std::min(1024, ne0);
@@ -3556,7 +3639,7 @@ int wsp_ggml_metal_op_arange(wsp_ggml_metal_op_t ctx, int idx) {
     const int nth = std::min(1024, ne0);
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_arange(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_arange(lib, op);
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -3587,7 +3670,7 @@ int wsp_ggml_metal_op_timestep_embedding(wsp_ggml_metal_op_t ctx, int idx) {
         /*.max_period =*/ max_period,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
     const int nth = std::max(1, std::min(1024, dim/2));
@@ -3617,7 +3700,7 @@ int wsp_ggml_metal_op_argmax(wsp_ggml_metal_op_t ctx, int idx) {
         /*.nb01 = */ nb01,
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_argmax(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_argmax(lib, op);
     const int64_t nrows = wsp_ggml_nrows(op->src[0]);
@@ -3626,7 +3709,7 @@ int wsp_ggml_metal_op_argmax(wsp_ggml_metal_op_t ctx, int idx) {
         nth *= 2;
     }
-    const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
@@ -3653,7 +3736,7 @@ int wsp_ggml_metal_op_argsort(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_argsort(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_argsort(lib, op);
     // bitonic sort requires the number of elements to be power of 2
     int nth = 1;
@@ -3678,14 +3761,19 @@ int wsp_ggml_metal_op_argsort(wsp_ggml_metal_op_t ctx, int idx) {
     }
     wsp_ggml_metal_kargs_argsort args = {
-        /*.ne00 =*/ ne00,
-        /*.ne01 =*/ ne01,
-        /*.ne02 =*/ ne02,
-        /*.ne03 =*/ ne03,
-        /*.nb00 =*/ nb00,
-        /*.nb01 =*/ nb01,
-        /*.nb02 =*/ nb02,
-        /*.nb03 =*/ nb03,
+        /*.ne00  =*/ ne00,
+        /*.ne01  =*/ ne01,
+        /*.ne02  =*/ ne02,
+        /*.ne03  =*/ ne03,
+        /*.nb00  =*/ nb00,
+        /*.nb01  =*/ nb01,
+        /*.nb02  =*/ nb02,
+        /*.nb03  =*/ nb03,
+        /*.ne0   =*/ ne0,
+        /*.ne1   =*/ ne1,
+        /*.ne2   =*/ ne2,
+        /*.ne3   =*/ ne3,
+        /*.top_k =*/ nth,
     };
     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
@@ -3697,7 +3785,7 @@ int wsp_ggml_metal_op_argsort(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);
-    wsp_ggml_metal_pipeline_t pipeline_merge = wsp_ggml_metal_library_get_pipeline_argsort_merge(lib, op);
+    auto pipeline_merge = wsp_ggml_metal_library_get_pipeline_argsort_merge(lib, op);
     int len = nth;
@@ -3705,15 +3793,20 @@ int wsp_ggml_metal_op_argsort(wsp_ggml_metal_op_t ctx, int idx) {
         wsp_ggml_metal_op_concurrency_reset(ctx);
         wsp_ggml_metal_kargs_argsort_merge args_merge = {
-            .ne00 = ne00,
-            .ne01 = ne01,
-            .ne02 = ne02,
-            .ne03 = ne03,
-            .nb00 = nb00,
-            .nb01 = nb01,
-            .nb02 = nb02,
-            .nb03 = nb03,
-            .len  = len,
+            /*.ne00  =*/ ne00,
+            /*.ne01  =*/ ne01,
+            /*.ne02  =*/ ne02,
+            /*.ne03  =*/ ne03,
+            /*.nb00  =*/ nb00,
+            /*.nb01  =*/ nb01,
+            /*.nb02  =*/ nb02,
+            /*.nb03  =*/ nb03,
+            /*.ne0   =*/ ne0,
+            /*.ne1   =*/ ne1,
+            /*.ne2   =*/ ne2,
+            /*.ne3   =*/ ne3,
+            /*.top_k =*/ ne00,
+            /*.len   =*/ len,
         };
         // merges per row
@@ -3737,6 +3830,118 @@ int wsp_ggml_metal_op_argsort(wsp_ggml_metal_op_t ctx, int idx) {
     return 1;
 }
+int wsp_ggml_metal_op_top_k(wsp_ggml_metal_op_t ctx, int idx) { // encodes node idx as one block-sort dispatch plus zero or more merge dispatches; always returns 1
+    wsp_ggml_tensor * op = ctx->node(idx);
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(op->src[0])); // the source tensor must have contiguous rows
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); // ne00..ne03: sizes of src[0]
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); // nb00..nb03: strides of src[0]
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne); // ne0..ne3: sizes of the output tensor
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb); // nb0..nb3: strides of the output tensor
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_top_k(lib, op);
+    // bitonic sort requires the number of elements to be power of 2
+    int nth = 1;
+    while (nth < ne00 && 2*nth <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { // doubling stops once nth >= ne00 or the next doubling would exceed the pipeline's thread limit
+        nth *= 2;
+    }
+    // blocks per row
+    const int npr = (ne00 + nth - 1)/nth; // ceil(ne00 / nth)
+    const size_t smem = WSP_GGML_PAD(nth*sizeof(int32_t), 16); // one int32 per thread, passed through WSP_GGML_PAD(.., 16)
+    wsp_ggml_metal_buffer_id bid_src0 = wsp_ggml_metal_get_buffer_id(op->src[0]);
+    wsp_ggml_metal_buffer_id bid_dst  = wsp_ggml_metal_get_buffer_id(op);
+    wsp_ggml_metal_buffer_id bid_tmp = bid_dst; // scratch buffer id starts as a copy of the destination id ...
+    bid_tmp.offs += sizeof(int32_t)*wsp_ggml_nelements(op->src[0]); // ... and is moved forward by one int32 per source element
+    if ((int) ceil(std::log(npr) / std::log(2)) % 2 == 1) { // ceil(log2(npr)) is odd: begin with dst and tmp exchanged
+        std::swap(bid_dst, bid_tmp); // NOTE(review): presumably so the per-pass swaps in the merge loop leave the final result in the op's own buffer - confirm
+    }
+    const int top_k = ne0; // k is taken from the first dimension of the output tensor
+    wsp_ggml_metal_kargs_argsort args = {
+        /*.ne00  =*/ ne00,
+        /*.ne01  =*/ ne01,
+        /*.ne02  =*/ ne02,
+        /*.ne03  =*/ ne03,
+        /*.nb00  =*/ nb00,
+        /*.nb01  =*/ nb01,
+        /*.nb02  =*/ nb02,
+        /*.nb03  =*/ nb03,
+        /*.ne0   =*/ ne0,
+        /*.ne1   =*/ ne1,
+        /*.ne2   =*/ ne2,
+        /*.ne3   =*/ ne3,
+        /*.top_k =*/ std::min(nth, top_k), // for each block, keep just the top_k indices
+    };
+    if (npr > 1) { // several blocks per row: args.ne0 becomes the number of indices kept per row across all blocks
+        args.ne0 = (npr - 1)*args.top_k + std::min(ne00 - (npr - 1)*nth, args.top_k); // (npr - 1) full blocks keep args.top_k each; the last block keeps min(its element count, args.top_k)
+    }
+    wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+    wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+    wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
+    wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1); // npr threadgroups per source row, nth threads each
+    auto pipeline_merge = wsp_ggml_metal_library_get_pipeline_top_k_merge(lib, op);
+    int len = args.top_k; // merging starts from the per-block kept count
+    while (len < args.ne0) { // one merge dispatch per iteration; len doubles at the end of each pass
+        wsp_ggml_metal_op_concurrency_reset(ctx);
+        // merges per row
+        const int nm = (args.ne0 + 2*len - 1) / (2*len); // ceil(args.ne0 / (2*len))
+        const int nth = std::min(512, std::min(len, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_merge))); // shadows the outer nth: capped at 512, at len and at the merge pipeline's thread limit
+        wsp_ggml_metal_kargs_argsort_merge args_merge = {
+            /*.ne00  =*/ ne00,
+            /*.ne01  =*/ ne01,
+            /*.ne02  =*/ ne02,
+            /*.ne03  =*/ ne03,
+            /*.nb00  =*/ nb00,
+            /*.nb01  =*/ nb01,
+            /*.nb02  =*/ nb02,
+            /*.nb03  =*/ nb03,
+            /*.ne0   =*/ args.ne0,
+            /*.ne1   =*/ ne1,
+            /*.ne2   =*/ ne2,
+            /*.ne3   =*/ ne3,
+            /*.top_k =*/ nm == 1 ? top_k : args.ne0, // the final merge outputs top_k elements
+            /*.len   =*/ len,
+        };
+        wsp_ggml_metal_encoder_set_pipeline(enc, pipeline_merge);
+        wsp_ggml_metal_encoder_set_bytes   (enc, &args_merge, sizeof(args_merge), 0);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
+        wsp_ggml_metal_encoder_set_buffer  (enc, bid_tmp,  3);
+        wsp_ggml_metal_encoder_dispatch_threadgroups(enc, nm*ne01, ne02, ne03, nth, 1, 1); // nm threadgroups per source row
+        std::swap(bid_dst, bid_tmp); // exchange the dst and tmp buffer ids for the next pass
+        len <<= 1; // double len for the next pass
+    }
+    return 1;
+}
 int wsp_ggml_metal_op_leaky_relu(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_tensor * op = ctx->node(idx);
@@ -3755,7 +3960,7 @@ int wsp_ggml_metal_op_leaky_relu(wsp_ggml_metal_op_t ctx, int idx) {
         /*.slope =*/ slope
     };
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
     int64_t n = wsp_ggml_nelements(op);
@@ -3773,6 +3978,57 @@ int wsp_ggml_metal_op_leaky_relu(wsp_ggml_metal_op_t ctx, int idx) {
     return 1;
 }
+int wsp_ggml_metal_op_tri(wsp_ggml_metal_op_t ctx, int idx) { // encodes node idx with the tri pipeline using a single dispatch; always returns 1
+    wsp_ggml_tensor * op = ctx->node(idx);
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); // ne00..ne03: sizes of src[0]
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); // nb00..nb03: strides of src[0]
+    WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne); // ne0..ne3: sizes of the output tensor
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb); // nb0..nb3: strides of the output tensor
+    wsp_ggml_metal_kargs_tri args = { // kernel arguments: source sizes/strides followed by output sizes/strides
+        /*.ne00  =*/ ne00,
+        /*.ne01  =*/ ne01,
+        /*.ne02  =*/ ne02,
+        /*.ne03  =*/ ne03,
+        /*.nb00  =*/ nb00,
+        /*.nb01  =*/ nb01,
+        /*.nb02  =*/ nb02,
+        /*.nb03  =*/ nb03,
+        /*.ne0   =*/ ne0,
+        /*.ne1   =*/ ne1,
+        /*.ne2   =*/ ne2,
+        /*.ne3   =*/ ne3,
+        /*.nb0   =*/ nb0,
+        /*.nb1   =*/ nb1,
+        /*.nb2   =*/ nb2,
+        /*.nb3   =*/ nb3,
+    };
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_tri(lib, op);
+    int nth = 32; // SIMD width
+    while (nth < ne00 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { // double until nth reaches ne00 or the pipeline's thread limit
+        nth *= 2;
+    }
+    nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); // the last doubling may have overshot the thread limit
+    nth = std::min(nth, ne00); // never more threads than elements in a row; this can bring nth below 32
+    wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+    wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+    wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op),         2);
+    wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1); // one threadgroup per source row (ne01 x ne02 x ne03), nth threads each
+    return 1;
+}
 int wsp_ggml_metal_op_opt_step_adamw(wsp_ggml_metal_op_t ctx, int idx) {
     wsp_ggml_tensor * op = ctx->node(idx);
@@ -3784,7 +4040,7 @@ int wsp_ggml_metal_op_opt_step_adamw(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);
     const int64_t np = wsp_ggml_nelements(op->src[0]);
     wsp_ggml_metal_kargs_opt_step_adamw args = {
@@ -3820,7 +4076,7 @@ int wsp_ggml_metal_op_opt_step_sgd(wsp_ggml_metal_op_t ctx, int idx) {
     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
-    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);
+    auto pipeline = wsp_ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);
     const int64_t np = wsp_ggml_nelements(op->src[0]);
     wsp_ggml_metal_kargs_opt_step_sgd args = {
@@ -3842,3 +4098,64 @@ int wsp_ggml_metal_op_opt_step_sgd(wsp_ggml_metal_op_t ctx, int idx) {
     return 1;
 }
+int wsp_ggml_metal_op_count_equal(wsp_ggml_metal_op_t ctx, int idx) { // encodes node idx as two dispatches: a memset of the output, then the count_equal kernel; always returns 1
+    wsp_ggml_tensor * op = ctx->node(idx);
+    wsp_ggml_metal_library_t lib = ctx->lib;
+    wsp_ggml_metal_encoder_t enc = ctx->enc;
+    WSP_GGML_TENSOR_LOCALS(int32_t,  ne0, op->src[0], ne); // ne00..ne03: sizes of src[0]
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); // nb00..nb03: strides of src[0]
+    WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); // nb10..nb13: strides of src[1]
+    { // step 1: run the memset pipeline with val = 0 on the output buffer
+        wsp_ggml_metal_kargs_memset args = { /*.val =*/ 0 };
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_memset(lib, op);
+        wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+        wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op), 1);
+        wsp_ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, 1, 1, 1); // a single threadgroup holding a single thread
+    }
+    wsp_ggml_metal_op_concurrency_reset(ctx); // NOTE(review): presumably orders the memset before the counting kernel, which binds the same output buffer - confirm
+    { // step 2: run the count_equal pipeline over src[0] and src[1]
+        wsp_ggml_metal_kargs_count_equal args = {
+            /*.ne00 =*/ ne00,
+            /*.ne01 =*/ ne01,
+            /*.ne02 =*/ ne02,
+            /*.ne03 =*/ ne03,
+            /*.nb00 =*/ nb00,
+            /*.nb01 =*/ nb01,
+            /*.nb02 =*/ nb02,
+            /*.nb03 =*/ nb03,
+            /*.nb10 =*/ nb10,
+            /*.nb11 =*/ nb11,
+            /*.nb12 =*/ nb12,
+            /*.nb13 =*/ nb13,
+        };
+        auto pipeline = wsp_ggml_metal_library_get_pipeline_count_equal(lib, op);
+        const size_t smem = pipeline.smem; // threadgroup memory size is taken from the pipeline
+        const int nth = 32*pipeline.nsg; // 32 threads per nsg unit; nsg is presumably the number of SIMD groups - TODO confirm
+        WSP_GGML_ASSERT(nth <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); // the chosen thread count must fit the pipeline's limit
+        wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+        wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+        wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op), 3); // the same output buffer that step 1 cleared
+        wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+        wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1); // one threadgroup per source row (ne01 x ne02 x ne03), nth threads each
+    }
+    return 1;
+}