npm - whisper.rn - Versions diffs - 0.5.0-rc.8 → 0.5.0 - Mend

whisper.rn 0.5.0-rc.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82) hide show

package/cpp/ggml-metal.m CHANGED Viewed

@@ -55,6 +55,12 @@ static struct wsp_ggml_backend_metal_device_context {
     bool has_residency_sets;
     bool has_bfloat;
     bool use_bfloat;
+    bool use_fusion;
+    int debug_fusion;
+    // how many times a given op was fused
+    uint64_t fuse_cnt[WSP_GGML_OP_COUNT];
     size_t max_size;
@@ -69,6 +75,9 @@ static struct wsp_ggml_backend_metal_device_context {
     /*.has_residency_sets      =*/ false,
     /*.has_bfloat              =*/ false,
     /*.use_bfloat              =*/ false,
+    /*.use_fusion              =*/ true,
+    /*.debug_fusion            =*/ 0,
+    /*.fuse_cnt                =*/ { 0 },
     /*.max_size                =*/ 0,
     /*.name                    =*/ "",
 };
@@ -83,16 +92,14 @@ static id<MTLDevice> wsp_ggml_backend_metal_device_acq(struct wsp_ggml_backend_m
     if (ctx->mtl_device == nil) {
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
-    }
-    if (ctx->mtl_device) {
         ctx->has_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
         ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
         ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
 #if defined(WSP_GGML_METAL_HAS_RESIDENCY_SETS)
-        ctx->has_residency_sets = getenv("WSP_GGML_METAL_NO_RESIDENCY") == NULL;
+        ctx->has_residency_sets = getenv("WSP_GGML_METAL_NO_RESIDENCY") == nil;
 #endif
         ctx->has_bfloat  = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
@@ -103,6 +110,14 @@ static id<MTLDevice> wsp_ggml_backend_metal_device_acq(struct wsp_ggml_backend_m
 #else
         ctx->use_bfloat = false;
 #endif
+        ctx->use_fusion = getenv("WSP_GGML_METAL_FUSION_DISABLE") == nil;
+        {
+            const char * val = getenv("WSP_GGML_METAL_FUSION_DEBUG");
+            ctx->debug_fusion = val ? atoi(val) : 0;
+        }
+        memset(ctx->fuse_cnt, 0, sizeof(ctx->fuse_cnt));
         ctx->max_size = ctx->mtl_device.maxBufferLength;
@@ -122,6 +137,18 @@ static void wsp_ggml_backend_metal_device_rel(struct wsp_ggml_backend_metal_devi
     ctx->mtl_device_ref_count--;
     if (ctx->mtl_device_ref_count == 0) {
+        if (ctx->debug_fusion > 0) {
+            fprintf(stderr, "%s: fusion stats:\n", __func__);
+            for (int i = 0; i < WSP_GGML_OP_COUNT; i++) {
+                if (ctx->fuse_cnt[i] == 0) {
+                    continue;
+                }
+                // note: cannot use wsp_ggml_log here
+                fprintf(stderr, "%s: - %s: %" PRIu64 "\n", __func__, wsp_ggml_op_name((enum wsp_ggml_op) i), ctx->fuse_cnt[i]);
+            }
+        }
         if (ctx->mtl_lock) {
             [ctx->mtl_lock release];
             ctx->mtl_lock = nil;
@@ -147,13 +174,28 @@ struct wsp_ggml_metal_kernel {
 enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_ADD,
-    WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_2,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_3,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_4,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_5,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_6,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_7,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_8,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8,
     WSP_GGML_METAL_KERNEL_TYPE_SUB,
-    WSP_GGML_METAL_KERNEL_TYPE_SUB_ROW,
+    WSP_GGML_METAL_KERNEL_TYPE_SUB_ROW_C4,
     WSP_GGML_METAL_KERNEL_TYPE_MUL,
-    WSP_GGML_METAL_KERNEL_TYPE_MUL_ROW,
+    WSP_GGML_METAL_KERNEL_TYPE_MUL_ROW_C4,
     WSP_GGML_METAL_KERNEL_TYPE_DIV,
-    WSP_GGML_METAL_KERNEL_TYPE_DIV_ROW,
+    WSP_GGML_METAL_KERNEL_TYPE_DIV_ROW_C4,
+    WSP_GGML_METAL_KERNEL_TYPE_ADD_ID,
     WSP_GGML_METAL_KERNEL_TYPE_REPEAT_F32,
     WSP_GGML_METAL_KERNEL_TYPE_REPEAT_F16,
     WSP_GGML_METAL_KERNEL_TYPE_REPEAT_I32,
@@ -173,6 +215,12 @@ enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_SILU,
     WSP_GGML_METAL_KERNEL_TYPE_SILU_4,
     WSP_GGML_METAL_KERNEL_TYPE_ELU,
+    WSP_GGML_METAL_KERNEL_TYPE_ABS,
+    WSP_GGML_METAL_KERNEL_TYPE_SGN,
+    WSP_GGML_METAL_KERNEL_TYPE_STEP,
+    WSP_GGML_METAL_KERNEL_TYPE_HARDSWISH,
+    WSP_GGML_METAL_KERNEL_TYPE_HARDSIGMOID,
+    WSP_GGML_METAL_KERNEL_TYPE_EXP,
     WSP_GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,
     WSP_GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,
     WSP_GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,
@@ -187,6 +235,7 @@ enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,
     WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,
     WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,
+    WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_MXFP4,
     WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,
     WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,
     WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,
@@ -212,11 +261,14 @@ enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1,
     WSP_GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL,
     WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM,
+    WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL,
+    WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD,
     WSP_GGML_METAL_KERNEL_TYPE_L2_NORM,
     WSP_GGML_METAL_KERNEL_TYPE_GROUP_NORM,
     WSP_GGML_METAL_KERNEL_TYPE_NORM,
     WSP_GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,
     WSP_GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,
+    WSP_GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP,
     WSP_GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32,
     WSP_GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
@@ -236,6 +288,7 @@ enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
+    WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,
@@ -260,6 +313,10 @@ enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5,
+    WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_2,
+    WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_3,
+    WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_4,
+    WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_5,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4,
@@ -301,6 +358,7 @@ enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,
+    WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,
@@ -323,6 +381,7 @@ enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,
+    WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,
@@ -347,6 +406,7 @@ enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16,
+    WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16,
     WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16,
@@ -529,6 +589,9 @@ enum wsp_ggml_metal_kernel_type {
     WSP_GGML_METAL_KERNEL_TYPE_REGLU,
     WSP_GGML_METAL_KERNEL_TYPE_GEGLU,
     WSP_GGML_METAL_KERNEL_TYPE_SWIGLU,
+    WSP_GGML_METAL_KERNEL_TYPE_SWIGLU_OAI,
+    WSP_GGML_METAL_KERNEL_TYPE_GEGLU_ERF,
+    WSP_GGML_METAL_KERNEL_TYPE_GEGLU_QUICK,
     WSP_GGML_METAL_KERNEL_TYPE_SUM_ROWS,
     WSP_GGML_METAL_KERNEL_TYPE_MEAN,
     WSP_GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
@@ -1130,13 +1193,28 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         // simd_sum and simd_max requires MTLGPUFamilyApple7
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD,                             add,                             true);
-        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW,                         add_row,                         true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_2,                      add_fuse_2,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_3,                      add_fuse_3,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_4,                      add_fuse_4,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_5,                      add_fuse_5,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_6,                      add_fuse_6,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_7,                      add_fuse_7,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_8,                      add_fuse_8,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4,                      add_row_c4,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2,               add_row_c4_fuse_2,               true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3,               add_row_c4_fuse_3,               true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4,               add_row_c4_fuse_4,               true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5,               add_row_c4_fuse_5,               true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6,               add_row_c4_fuse_6,               true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7,               add_row_c4_fuse_7,               true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8,               add_row_c4_fuse_8,               true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SUB,                             sub,                             true);
-        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SUB_ROW,                         sub_row,                         true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SUB_ROW_C4,                      sub_row_c4,                      true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL,                             mul,                             true);
-        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_ROW,                         mul_row,                         true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_ROW_C4,                      mul_row_c4,                      true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_DIV,                             div,                             true);
-        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_DIV_ROW,                         div_row,                         true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_DIV_ROW_C4,                      div_row_c4,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ADD_ID,                          add_id,                          true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_REPEAT_F32,                      repeat_f32,                      true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_REPEAT_F16,                      repeat_f16,                      true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_REPEAT_I32,                      repeat_i32,                      true);
@@ -1156,6 +1234,12 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SILU,                            silu,                            true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SILU_4,                          silu_4,                          true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ELU,                             elu,                             true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ABS,                             abs,                             true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SGN,                             sgn,                             true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_STEP,                            step,                            true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_HARDSWISH,                       hardswish,                       true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_HARDSIGMOID,                     hardsigmoid,                     true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_EXP,                             exp,                             true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,                    soft_max_f16,                    has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,                  soft_max_f16_4,                  has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,                    soft_max_f32,                    has_simdgroup_reduction);
@@ -1170,6 +1254,7 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,                   get_rows_q5_0,                   true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,                   get_rows_q5_1,                   true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,                   get_rows_q8_0,                   true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_MXFP4,                  get_rows_mxfp4,                   true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,                   get_rows_q2_K,                   true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,                   get_rows_q3_K,                   true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,                   get_rows_q4_K,                   true);
@@ -1195,11 +1280,14 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1,                   set_rows_q5_1,                   true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL,                 set_rows_iq4_nl,                 true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM,                        rms_norm,                        has_simdgroup_reduction);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL,                    rms_norm_mul,                    has_simdgroup_reduction);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD,                rms_norm_mul_add,                has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_L2_NORM,                         l2_norm,                         has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GROUP_NORM,                      group_norm,                      has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_NORM,                            norm,                            true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,                    ssm_conv_f32,                    true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,                    ssm_scan_f32,                    true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP,              ssm_scan_f32_group,              true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32,                   rwkv_wkv6_f32,                   true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32,                   rwkv_wkv7_f32,                   true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,                  mul_mv_f32_f32,                  has_simdgroup_reduction);
@@ -1219,6 +1307,7 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,                 mul_mv_q5_0_f32,                 has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,                 mul_mv_q5_1_f32,                 has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,                 mul_mv_q8_0_f32,                 has_simdgroup_reduction);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32,                mul_mv_mxfp4_f32,                has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,         mul_mv_ext_f16_f32_r1_2,         has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,         mul_mv_ext_f16_f32_r1_3,         has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,         mul_mv_ext_f16_f32_r1_4,         has_simdgroup_reduction);
@@ -1243,6 +1332,10 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3,        mul_mv_ext_q8_0_f32_r1_3,        has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4,        mul_mv_ext_q8_0_f32_r1_4,        has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5,        mul_mv_ext_q8_0_f32_r1_5,        has_simdgroup_reduction);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_2,       mul_mv_ext_mxfp4_f32_r1_2,       has_simdgroup_reduction);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_3,       mul_mv_ext_mxfp4_f32_r1_3,       has_simdgroup_reduction);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_4,       mul_mv_ext_mxfp4_f32_r1_4,       has_simdgroup_reduction);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_5,       mul_mv_ext_mxfp4_f32_r1_5,       has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2,        mul_mv_ext_q4_K_f32_r1_2,        has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3,        mul_mv_ext_q4_K_f32_r1_3,        has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4,        mul_mv_ext_q4_K_f32_r1_4,        has_simdgroup_reduction);
@@ -1284,6 +1377,7 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,              mul_mv_id_q5_0_f32,              has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,              mul_mv_id_q5_1_f32,              has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,              mul_mv_id_q8_0_f32,              has_simdgroup_reduction);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32,             mul_mv_id_mxfp4_f32,             has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,              mul_mv_id_q2_K_f32,              has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,              mul_mv_id_q3_K_f32,              has_simdgroup_reduction);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,              mul_mv_id_q4_K_f32,              has_simdgroup_reduction);
@@ -1306,6 +1400,8 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,                 mul_mm_q5_0_f32,                 has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,                 mul_mm_q5_1_f32,                 has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,                 mul_mm_q8_0_f32,                 has_simdgroup_mm);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,                mul_mm_mxfp4_f32,                has_simdgroup_mm);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,                mul_mm_mxfp4_f32,                has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,                 mul_mm_q2_K_f32,                 has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,                 mul_mm_q3_K_f32,                 has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,                 mul_mm_q4_K_f32,                 has_simdgroup_mm);
@@ -1330,6 +1426,7 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16,              mul_mm_id_q5_0_f16,              has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16,              mul_mm_id_q5_1_f16,              has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16,              mul_mm_id_q8_0_f16,              has_simdgroup_mm);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16,             mul_mm_id_mxfp4_f16,             has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16,              mul_mm_id_q2_K_f16,              has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16,              mul_mm_id_q3_K_f16,              has_simdgroup_mm);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16,              mul_mm_id_q4_K_f16,              has_simdgroup_mm);
@@ -1512,6 +1609,9 @@ static struct wsp_ggml_backend_metal_context * wsp_ggml_metal_init(wsp_ggml_back
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_REGLU,                           reglu,                           true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GEGLU,                           geglu,                           true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SWIGLU,                          swiglu,                          true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SWIGLU_OAI,                      swiglu_oai,                      true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GEGLU_ERF,                       geglu_erf,                       true);
+        WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_GEGLU_QUICK,                     geglu_quick,                     true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_SUM_ROWS,                        sum_rows,                        true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_MEAN,                            mean,                            true);
         WSP_GGML_METAL_ADD_KERNEL(WSP_GGML_METAL_KERNEL_TYPE_ARGMAX,                          argmax,                          true);
@@ -1686,6 +1786,12 @@ static bool wsp_ggml_metal_supports_op(const struct wsp_ggml_backend_metal_devic
                 case WSP_GGML_UNARY_OP_SILU:
                 case WSP_GGML_UNARY_OP_ELU:
                 case WSP_GGML_UNARY_OP_NEG:
+                case WSP_GGML_UNARY_OP_ABS:
+                case WSP_GGML_UNARY_OP_SGN:
+                case WSP_GGML_UNARY_OP_STEP:
+                case WSP_GGML_UNARY_OP_HARDSWISH:
+                case WSP_GGML_UNARY_OP_HARDSIGMOID:
+                case WSP_GGML_UNARY_OP_EXP:
                     return wsp_ggml_is_contiguous(op->src[0]) && op->src[0]->type == WSP_GGML_TYPE_F32;
                 default:
                     return false;
@@ -1695,6 +1801,9 @@ static bool wsp_ggml_metal_supports_op(const struct wsp_ggml_backend_metal_devic
                 case WSP_GGML_GLU_OP_REGLU:
                 case WSP_GGML_GLU_OP_GEGLU:
                 case WSP_GGML_GLU_OP_SWIGLU:
+                case WSP_GGML_GLU_OP_SWIGLU_OAI:
+                case WSP_GGML_GLU_OP_GEGLU_ERF:
+                case WSP_GGML_GLU_OP_GEGLU_QUICK:
                     return wsp_ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == WSP_GGML_TYPE_F32;
                default:
                     return false;
@@ -1710,6 +1819,7 @@ static bool wsp_ggml_metal_supports_op(const struct wsp_ggml_backend_metal_devic
         case WSP_GGML_OP_SUB:
         case WSP_GGML_OP_MUL:
         case WSP_GGML_OP_DIV:
+        case WSP_GGML_OP_ADD_ID:
             return op->src[0]->type == WSP_GGML_TYPE_F32;
         case WSP_GGML_OP_ACC:
         case WSP_GGML_OP_REPEAT:
@@ -1729,7 +1839,7 @@ static bool wsp_ggml_metal_supports_op(const struct wsp_ggml_backend_metal_devic
         case WSP_GGML_OP_MEAN:
         case WSP_GGML_OP_SOFT_MAX:
         case WSP_GGML_OP_GROUP_NORM:
-            return has_simdgroup_reduction && wsp_ggml_is_contiguous(op->src[0]);
+            return has_simdgroup_reduction && wsp_ggml_is_contiguous_rows(op->src[0]);
         case WSP_GGML_OP_RMS_NORM:
         case WSP_GGML_OP_L2_NORM:
             return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && wsp_ggml_is_contiguous_1(op->src[0]));
@@ -1871,9 +1981,10 @@ static bool wsp_ggml_metal_supports_op(const struct wsp_ggml_backend_metal_devic
     }
 }
-static bool wsp_ggml_metal_encode_node(
+static int wsp_ggml_metal_encode_node(
                         wsp_ggml_backend_t   backend,
                                    int   idx,
+                                   int   idx_end,
           id<MTLComputeCommandEncoder>   encoder,
             struct wsp_ggml_metal_mem_pool * mem_pool) {
     struct wsp_ggml_backend_metal_context        * ctx     = backend->context;
@@ -1881,7 +1992,10 @@ static bool wsp_ggml_metal_encode_node(
     struct wsp_ggml_cgraph * gf = ctx->gf;
-    struct wsp_ggml_tensor * node = wsp_ggml_graph_node(gf, idx);
+    enum wsp_ggml_op ops[8];
+    struct wsp_ggml_tensor ** nodes = wsp_ggml_graph_nodes(gf) + idx;
+    struct wsp_ggml_tensor *  node  = nodes[0];
     //WSP_GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, wsp_ggml_op_name(node->op));
@@ -1891,7 +2005,7 @@ static bool wsp_ggml_metal_encode_node(
     struct wsp_ggml_tensor * dst  = node;
     if (wsp_ggml_is_empty(dst)) {
-        return true;
+        return 1;
     }
     switch (dst->op) {
@@ -1902,7 +2016,7 @@ static bool wsp_ggml_metal_encode_node(
         case WSP_GGML_OP_PERMUTE:
             {
                 // noop -> next node
-            } return true;
+            } return 1;
         default:
             {
             } break;
@@ -1957,6 +2071,7 @@ static bool wsp_ggml_metal_encode_node(
     const enum wsp_ggml_type src0t = src0 ? src0->type : WSP_GGML_TYPE_COUNT;
     const enum wsp_ggml_type src1t = src1 ? src1->type : WSP_GGML_TYPE_COUNT;
+    const enum wsp_ggml_type src2t = src2 ? src2->type : WSP_GGML_TYPE_COUNT;
     const enum wsp_ggml_type dstt  = dst  ? dst->type  : WSP_GGML_TYPE_COUNT;
     size_t offs_src0 = 0;
@@ -1969,6 +2084,8 @@ static bool wsp_ggml_metal_encode_node(
     id<MTLBuffer> id_src2 = src2 ? wsp_ggml_metal_get_buffer(src2, &offs_src2) : nil;
     id<MTLBuffer> id_dst  = dst  ? wsp_ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
+    int n_fuse = 1;
 #if 0
     WSP_GGML_LOG_INFO("%s: op - %s\n", __func__, wsp_ggml_op_name(dst->op));
     if (src0) {
@@ -2040,37 +2157,15 @@ static bool wsp_ggml_metal_encode_node(
                 WSP_GGML_ASSERT(src0t == WSP_GGML_TYPE_F32);
                 WSP_GGML_ASSERT(src1t == WSP_GGML_TYPE_F32);
+                WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(src0));
+                WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(src1));
                 const size_t offs = 0;
                 bool bcast_row = false;
                 id<MTLComputePipelineState> pipeline = nil;
-                if (wsp_ggml_nelements(src1) == ne10 && wsp_ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
-                    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0));
-                    // src1 is a row
-                    WSP_GGML_ASSERT(ne11 == 1);
-                    switch (dst->op) {
-                        case WSP_GGML_OP_ADD: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
-                        case WSP_GGML_OP_SUB: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
-                        case WSP_GGML_OP_MUL: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
-                        case WSP_GGML_OP_DIV: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
-                        default: WSP_GGML_ABORT("fatal error");
-                    }
-                    bcast_row = true;
-                } else {
-                    switch (dst->op) {
-                        case WSP_GGML_OP_ADD: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
-                        case WSP_GGML_OP_SUB: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
-                        case WSP_GGML_OP_MUL: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
-                        case WSP_GGML_OP_DIV: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
-                        default: WSP_GGML_ABORT("fatal error");
-                    }
-                }
                 wsp_ggml_metal_kargs_bin args = {
                     /*.ne00 =*/ ne00,
                     /*.ne01 =*/ ne01,
@@ -2097,12 +2192,119 @@ static bool wsp_ggml_metal_encode_node(
                     /*.nb2  =*/ nb2,
                     /*.nb3  =*/ nb3,
                     /*.offs =*/ offs,
+                    /*.o1   =*/ { offs_src1 },
                 };
+                // c[0] = add(a,    b[0])
+                // c[1] = add(c[0], b[1])
+                // c[2] = add(c[1], b[2])
+                // ...
+                if (ctx_dev->use_fusion) {
+                    ops[0] = WSP_GGML_OP_ADD;
+                    ops[1] = WSP_GGML_OP_ADD;
+                    ops[2] = WSP_GGML_OP_ADD;
+                    ops[3] = WSP_GGML_OP_ADD;
+                    ops[4] = WSP_GGML_OP_ADD;
+                    ops[5] = WSP_GGML_OP_ADD;
+                    ops[6] = WSP_GGML_OP_ADD;
+                    ops[7] = WSP_GGML_OP_ADD;
+                    size_t offs_fuse;
+                    id<MTLBuffer> id_fuse;
+                    // note: in metal, we sometimes encode the graph in parallel so we have to avoid fusing nodes
+                    //       across splits. idx_end indicates the last node in the current split
+                    for (n_fuse = 0; n_fuse <= 6 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
+                        if (!wsp_ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
+                            break;
+                        }
+                        if (nodes[n_fuse] != nodes[n_fuse + 1]->src[0]) {
+                            break;
+                        }
+                        // b[0] === b[1] === ...
+                        if (!wsp_ggml_are_same_layout(nodes[n_fuse]->src[1], nodes[n_fuse + 1]->src[1])) {
+                            break;
+                        }
+                        // only fuse nodes if src1 is in the same Metal buffer
+                        id_fuse = wsp_ggml_metal_get_buffer(nodes[n_fuse + 1]->src[1], &offs_fuse);
+                        if (id_fuse != id_src1) {
+                            break;
+                        }
+                        ctx_dev->fuse_cnt[nodes[n_fuse + 1]->op]++;
+                        args.o1[n_fuse + 1] = offs_fuse;
+                    }
+                    ++n_fuse;
+                    if (ctx_dev->debug_fusion > 1 && n_fuse > 1) {
+                        WSP_GGML_LOG_DEBUG("%s: fuse: ADD x %d\n", __func__, n_fuse);
+                    }
+                }
+                if (wsp_ggml_nelements(src1) == ne10 && wsp_ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+                    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0));
+                    // src1 is a row
+                    WSP_GGML_ASSERT(ne11 == 1);
+                    switch (dst->op) {
+                        case WSP_GGML_OP_ADD:
+                            {
+                                switch (n_fuse) {
+                                    case 1: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4       ].pipeline; break;
+                                    case 2: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2].pipeline; break;
+                                    case 3: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3].pipeline; break;
+                                    case 4: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4].pipeline; break;
+                                    case 5: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5].pipeline; break;
+                                    case 6: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6].pipeline; break;
+                                    case 7: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7].pipeline; break;
+                                    case 8: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8].pipeline; break;
+                                    default: WSP_GGML_ABORT("fatal error");
+                                }
+                            } break;
+                        case WSP_GGML_OP_SUB: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SUB_ROW_C4].pipeline; break;
+                        case WSP_GGML_OP_MUL: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_ROW_C4].pipeline; break;
+                        case WSP_GGML_OP_DIV: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_DIV_ROW_C4].pipeline; break;
+                        default: WSP_GGML_ABORT("fatal error");
+                    }
+                    bcast_row = true;
+                } else {
+                    switch (dst->op) {
+                        case WSP_GGML_OP_ADD:
+                            {
+                                switch (n_fuse) {
+                                    case 1: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD       ].pipeline; break;
+                                    case 2: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_2].pipeline; break;
+                                    case 3: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_3].pipeline; break;
+                                    case 4: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_4].pipeline; break;
+                                    case 5: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_5].pipeline; break;
+                                    case 6: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_6].pipeline; break;
+                                    case 7: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_7].pipeline; break;
+                                    case 8: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_FUSE_8].pipeline; break;
+                                    default: WSP_GGML_ABORT("fatal error");
+                                }
+                            } break;
+                        case WSP_GGML_OP_SUB: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
+                        case WSP_GGML_OP_MUL: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
+                        case WSP_GGML_OP_DIV: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
+                        default: WSP_GGML_ABORT("fatal error");
+                    }
+                }
+                if (n_fuse > 1) {
+                    id_dst = wsp_ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
+                }
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBytes:&args length:sizeof(args) atIndex:0];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                [encoder setBuffer:id_src1 offset:0         atIndex:2];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
                 if (bcast_row) {
@@ -2110,11 +2312,47 @@ static bool wsp_ggml_metal_encode_node(
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } else {
-                    const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+                    int nth = 32;
+                    while (16*nth < ne0 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                        nth *= 2;
+                    }
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                 }
             } break;
+        case WSP_GGML_OP_ADD_ID:
+            {
+                WSP_GGML_ASSERT(src0t == WSP_GGML_TYPE_F32);
+                WSP_GGML_ASSERT(src1t == WSP_GGML_TYPE_F32);
+                WSP_GGML_ASSERT(src2t == WSP_GGML_TYPE_I32);
+                WSP_GGML_ASSERT(dstt  == WSP_GGML_TYPE_F32);
+                WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(src0));
+                id<MTLComputePipelineState> pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ADD_ID].pipeline;
+                wsp_ggml_metal_kargs_add_id args = {
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb11 =*/ nb11,
+                    /*.nb21 =*/ nb21,
+                };
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:4];
+                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
         case WSP_GGML_OP_REPEAT:
             {
                 id<MTLComputePipelineState> pipeline;
@@ -2235,12 +2473,13 @@ static bool wsp_ggml_metal_encode_node(
                     /*.nb2  =*/ pnb2,
                     /*.nb3  =*/ pnb3,
                     /*.offs =*/ offs,
+                    /*.o1   =*/ { offs_src1},
                 };
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBytes:&args length:sizeof(args) atIndex:0];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                [encoder setBuffer:id_src1 offset:0         atIndex:2];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
                 const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
@@ -2252,7 +2491,9 @@ static bool wsp_ggml_metal_encode_node(
                 WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0));
                 float scale;
-                memcpy(&scale, dst->op_params, sizeof(scale));
+                float bias;
+                memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(float));
+                memcpy(&bias,  ((const int32_t *) dst->op_params) + 1, sizeof(float));
                 int64_t n = wsp_ggml_nelements(dst);
@@ -2269,6 +2510,7 @@ static bool wsp_ggml_metal_encode_node(
                 [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
                 [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+                [encoder setBytes:&bias  length:sizeof(bias)  atIndex:3];
                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
@@ -2432,6 +2674,78 @@ static bool wsp_ggml_metal_encode_node(
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } break;
+                case WSP_GGML_UNARY_OP_ABS:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_ABS].pipeline;
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                    const int64_t n = wsp_ggml_nelements(dst);
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case WSP_GGML_UNARY_OP_SGN:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SGN].pipeline;
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                    const int64_t n = wsp_ggml_nelements(dst);
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case WSP_GGML_UNARY_OP_STEP:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_STEP].pipeline;
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                    const int64_t n = wsp_ggml_nelements(dst);
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case WSP_GGML_UNARY_OP_HARDSWISH:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_HARDSWISH].pipeline;
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                    const int64_t n = wsp_ggml_nelements(dst);
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case WSP_GGML_UNARY_OP_HARDSIGMOID:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_HARDSIGMOID].pipeline;
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                    const int64_t n = wsp_ggml_nelements(dst);
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case WSP_GGML_UNARY_OP_EXP:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_EXP].pipeline;
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                    const int64_t n = wsp_ggml_nelements(dst);
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
                 default:
                 {
                     WSP_GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, wsp_ggml_op_name(dst->op));
@@ -2458,11 +2772,22 @@ static bool wsp_ggml_metal_encode_node(
                     case WSP_GGML_GLU_OP_SWIGLU:
                         pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SWIGLU].pipeline;
                         break;
+                    case WSP_GGML_GLU_OP_SWIGLU_OAI:
+                        pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SWIGLU_OAI].pipeline;
+                        break;
+                    case WSP_GGML_GLU_OP_GEGLU_ERF:
+                        pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_GEGLU_ERF].pipeline;
+                        break;
+                    case WSP_GGML_GLU_OP_GEGLU_QUICK:
+                        pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_GEGLU_QUICK].pipeline;
+                        break;
                     default:
                         WSP_GGML_ABORT("fatal error");
                 }
-                const int32_t swp = ((const int32_t *) dst->op_params)[1];
+                const int32_t swp = wsp_ggml_get_op_params_i32(dst, 1);
+                const float alpha = wsp_ggml_get_op_params_f32(dst, 2);
+                const float limit = wsp_ggml_get_op_params_f32(dst, 3);
                 const int32_t i00 = swp ? ne0 : 0;
                 const int32_t i10 = swp ? 0 : ne0;
@@ -2476,6 +2801,8 @@ static bool wsp_ggml_metal_encode_node(
                     /*.nb1  =*/ nb1,
                     /*.i00  =*/ src1 ? 0 : i00,
                     /*.i10  =*/ src1 ? 0 : i10,
+                    /*.alpha=*/ alpha,
+                    /*.limit=*/ limit
                 };
                 [encoder setComputePipelineState:pipeline];
@@ -2648,10 +2975,7 @@ static bool wsp_ggml_metal_encode_node(
                 memcpy(&scale,    ((const int32_t *) dst->op_params) + 0, sizeof(scale));
                 memcpy(&max_bias, ((const int32_t *) dst->op_params) + 1, sizeof(max_bias));
-                const int64_t nrows_x = wsp_ggml_nrows(src0);
-                const int64_t nrows_y = src0->ne[1];
-                const uint32_t n_head      = nrows_x/nrows_y;
+                const uint32_t n_head      = src0->ne[2];
                 const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
                 const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
@@ -2664,7 +2988,7 @@ static bool wsp_ggml_metal_encode_node(
                 id<MTLBuffer> h_src0 = h_src0 = wsp_ggml_metal_mem_pool_alloc(mem_pool, wsp_ggml_nbytes(src0));
                 if (!h_src0) {
                     WSP_GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, wsp_ggml_nbytes(src0));
-                    return false;
+                    return 0;
                 }
                 offs_src0 = 0;
@@ -2711,6 +3035,18 @@ static bool wsp_ggml_metal_encode_node(
                     /*.ne00        =*/ ne00,
                     /*.ne01        =*/ ne01,
                     /*.ne02        =*/ ne02,
+                    /*.nb01        =*/ nb01,
+                    /*.nb02        =*/ nb02,
+                    /*.nb03        =*/ nb03,
+                    /*.ne11        =*/ ne11,
+                    /*.ne12        =*/ ne12,
+                    /*.ne13        =*/ ne13,
+                    /*.nb11        =*/ nb11,
+                    /*.nb12        =*/ nb12,
+                    /*.nb13        =*/ nb13,
+                    /*.nb1         =*/ nb1,
+                    /*.nb2         =*/ nb2,
+                    /*.nb3         =*/ nb3,
                     /*.scale       =*/ scale,
                     /*.max_bias    =*/ max_bias,
                     /*.m0          =*/ m0,
@@ -2725,12 +3061,17 @@ static bool wsp_ggml_metal_encode_node(
                 } else {
                     [encoder setBuffer:h_src0 offset:offs_src0  atIndex:1];
                 }
-                [encoder setBuffer:id_dst offset:offs_dst       atIndex:2];
-                [encoder setBytes:&args   length:sizeof(args)   atIndex:3];
+                if (id_src2) {
+                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
+                } else {
+                    [encoder setBuffer:h_src0 offset:offs_src0  atIndex:2];
+                }
+                [encoder setBuffer:id_dst offset:offs_dst       atIndex:3];
+                [encoder setBytes:&args   length:sizeof(args)   atIndex:4];
                 [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
         case WSP_GGML_OP_DIAG_MASK_INF:
             {
@@ -2804,71 +3145,92 @@ static bool wsp_ggml_metal_encode_node(
                 struct wsp_ggml_tensor * src3 = node->src[3];
                 struct wsp_ggml_tensor * src4 = node->src[4];
                 struct wsp_ggml_tensor * src5 = node->src[5];
+                struct wsp_ggml_tensor * src6 = node->src[6];
                 WSP_GGML_ASSERT(src3);
                 WSP_GGML_ASSERT(src4);
                 WSP_GGML_ASSERT(src5);
+                WSP_GGML_ASSERT(src6);
                 size_t offs_src3 = 0;
                 size_t offs_src4 = 0;
                 size_t offs_src5 = 0;
+                size_t offs_src6 = 0;
                 id<MTLBuffer> id_src3 = src3 ? wsp_ggml_metal_get_buffer(src3, &offs_src3) : nil;
                 id<MTLBuffer> id_src4 = src4 ? wsp_ggml_metal_get_buffer(src4, &offs_src4) : nil;
                 id<MTLBuffer> id_src5 = src5 ? wsp_ggml_metal_get_buffer(src5, &offs_src5) : nil;
+                id<MTLBuffer> id_src6 = src6 ? wsp_ggml_metal_get_buffer(src6, &offs_src6) : nil;
-                const int64_t  ne30 = src3->ne[0]; WSP_GGML_UNUSED(ne30);
+                const int64_t  ne30 = src3->ne[0];
                 const int64_t  ne31 = src3->ne[1]; WSP_GGML_UNUSED(ne31);
-                const uint64_t nb30 = src3->nb[0];
+                const uint64_t nb30 = src3->nb[0]; WSP_GGML_UNUSED(nb30);
                 const uint64_t nb31 = src3->nb[1];
                 const int64_t  ne40 = src4->ne[0]; WSP_GGML_UNUSED(ne40);
-                const int64_t  ne41 = src4->ne[1]; WSP_GGML_UNUSED(ne41);
+                const int64_t  ne41 = src4->ne[1];
                 const int64_t  ne42 = src4->ne[2]; WSP_GGML_UNUSED(ne42);
+                const int64_t  ne43 = src4->ne[3]; WSP_GGML_UNUSED(ne43);
-                const uint64_t nb40 = src4->nb[0];
+                const uint64_t nb40 = src4->nb[0]; WSP_GGML_UNUSED(nb40);
                 const uint64_t nb41 = src4->nb[1];
                 const uint64_t nb42 = src4->nb[2];
+                const uint64_t nb43 = src4->nb[3];
                 const int64_t  ne50 = src5->ne[0]; WSP_GGML_UNUSED(ne50);
                 const int64_t  ne51 = src5->ne[1]; WSP_GGML_UNUSED(ne51);
                 const int64_t  ne52 = src5->ne[2]; WSP_GGML_UNUSED(ne52);
+                const int64_t  ne53 = src5->ne[3]; WSP_GGML_UNUSED(ne53);
-                const uint64_t nb50 = src5->nb[0];
+                const uint64_t nb50 = src5->nb[0]; WSP_GGML_UNUSED(nb50);
                 const uint64_t nb51 = src5->nb[1];
                 const uint64_t nb52 = src5->nb[2];
+                const uint64_t nb53 = src5->nb[3];
+                const int64_t  ne60 = src6->ne[0]; WSP_GGML_UNUSED(ne60);
+                const uint64_t nb60 = src6->nb[0]; WSP_GGML_UNUSED(nb60);
                 const int64_t d_state      = ne00;
                 const int64_t d_inner      = ne01;
-                const int64_t n_seq_tokens = ne11;
-                const int64_t n_seqs       = ne02;
+                const int64_t n_head       = ne02;
+                const int64_t n_group      = ne41;
+                const int64_t n_seq_tokens = ne12;
+                const int64_t n_seqs       = ne13;
-                id<MTLComputePipelineState> pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
+                id<MTLComputePipelineState> pipeline = nil;
+                if (ne30 == 1) {
+                    // Mamba-2
+                    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP].pipeline;
+                } else {
+                    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
+                }
                 wsp_ggml_metal_kargs_ssm_scan args = {
-                    /*.d_state =*/ d_state,
-                    /*.d_inner =*/ d_inner,
+                    /*.d_state      =*/ d_state,
+                    /*.d_inner      =*/ d_inner,
+                    /*.n_head       =*/ n_head,
+                    /*.n_group      =*/ n_group,
                     /*.n_seq_tokens =*/ n_seq_tokens,
-                    /*.n_seqs =*/ n_seqs,
-                    /*.nb00 =*/ nb00,
-                    /*.nb01 =*/ nb01,
-                    /*.nb02 =*/ nb02,
-                    /*.nb10 =*/ nb10,
-                    /*.nb11 =*/ nb11,
-                    /*.nb12 =*/ nb12,
-                    /*.nb13 =*/ nb13,
-                    /*.nb20 =*/ nb20,
-                    /*.nb21 =*/ nb21,
-                    /*.nb22 =*/ nb22,
-                    /*.nb30 =*/ nb30,
-                    /*.nb31 =*/ nb31,
-                    /*.nb40 =*/ nb40,
-                    /*.nb41 =*/ nb41,
-                    /*.nb42 =*/ nb42,
-                    /*.nb50 =*/ nb50,
-                    /*.nb51 =*/ nb51,
-                    /*.nb52 =*/ nb52,
+                    /*.n_seqs       =*/ n_seqs,
+                    /*.s_off        =*/ wsp_ggml_nelements(src1) * sizeof(float),
+                    /*.nb01         =*/ nb01,
+                    /*.nb02         =*/ nb02,
+                    /*.nb03         =*/ nb03,
+                    /*.nb11         =*/ nb11,
+                    /*.nb12         =*/ nb12,
+                    /*.nb13         =*/ nb13,
+                    /*.nb21         =*/ nb21,
+                    /*.nb22         =*/ nb22,
+                    /*.nb31         =*/ nb31,
+                    /*.nb41         =*/ nb41,
+                    /*.nb42         =*/ nb42,
+                    /*.nb43         =*/ nb43,
+                    /*.nb51         =*/ nb51,
+                    /*.nb52         =*/ nb52,
+                    /*.nb53         =*/ nb53,
                 };
                 [encoder setComputePipelineState:pipeline];
@@ -2878,10 +3240,27 @@ static bool wsp_ggml_metal_encode_node(
                 [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
                 [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
                 [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:6];
-                [encoder setBytes:&args    length:sizeof(args) atIndex:7];
+                [encoder setBuffer:id_src6 offset:offs_src6 atIndex:6];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:7];
+                [encoder setBytes:&args    length:sizeof(args) atIndex:8];
+                // One shared memory bucket for each simd group in the threadgroup
+                // NOTE: Metal kernels require the buffer size to be multiple of 16 bytes
+                //  https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
+                if (d_state >= 32) {
+                    WSP_GGML_ASSERT((int64_t)(d_state / 32) <= 32);
+                    const int64_t shmem_size = 32;
+                    WSP_GGML_ASSERT(d_state <= (int64_t)pipeline.maxTotalThreadsPerThreadgroup);
+                    [encoder setThreadgroupMemoryLength:(shmem_size)*sizeof(float) atIndex:0];
+                }
-                [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                if (ne30 == 1) {
+                    // Mamba-2
+                    [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)];
+                } else {
+                    WSP_GGML_ASSERT(d_inner == 1);
+                    [encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)];
+                }
             } break;
         case WSP_GGML_OP_RWKV_WKV6:
             {
@@ -2986,6 +3365,7 @@ static bool wsp_ggml_metal_encode_node(
                        src0t == WSP_GGML_TYPE_Q5_0 ||
                        src0t == WSP_GGML_TYPE_Q5_1 ||
                        src0t == WSP_GGML_TYPE_Q8_0 ||
+                       src0t == WSP_GGML_TYPE_MXFP4 ||
                        src0t == WSP_GGML_TYPE_IQ4_NL ||
                        false) && (ne11 >= 2 && ne11 <= 8)
                      ) ||
@@ -3078,6 +3458,14 @@ static bool wsp_ggml_metal_encode_node(
                                 case 5: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5].pipeline; break;
                                 default: WSP_GGML_ABORT("not implemented");
                             } break;
+                        case WSP_GGML_TYPE_MXFP4:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_MXFP4_F32_R1_5].pipeline; break;
+                                default: WSP_GGML_ABORT("not implemented");
+                            } break;
                         case WSP_GGML_TYPE_Q4_K:
                             switch (r1ptg) {
                                 case 2: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2].pipeline; break;
@@ -3176,6 +3564,7 @@ static bool wsp_ggml_metal_encode_node(
                         case WSP_GGML_TYPE_Q5_0:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
                         case WSP_GGML_TYPE_Q5_1:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
                         case WSP_GGML_TYPE_Q8_0:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
+                        case WSP_GGML_TYPE_MXFP4:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32   ].pipeline; break;
                         case WSP_GGML_TYPE_Q2_K:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
                         case WSP_GGML_TYPE_Q3_K:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
                         case WSP_GGML_TYPE_Q4_K:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
@@ -3318,6 +3707,13 @@ static bool wsp_ggml_metal_encode_node(
                                 nr0 = N_R0_Q8_0;
                                 pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
                             } break;
+                        case WSP_GGML_TYPE_MXFP4:
+                            {
+                                nsg = N_SG_MXFP4;
+                                nr0 = N_R0_MXFP4;
+                                smem = 32*sizeof(float);
+                                pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32].pipeline;
+                            } break;
                         case WSP_GGML_TYPE_Q2_K:
                             {
                                 nsg = N_SG_Q2_K;
@@ -3451,8 +3847,6 @@ static bool wsp_ggml_metal_encode_node(
         case WSP_GGML_OP_MUL_MAT_ID:
             {
                 // src2 = ids
-                const enum wsp_ggml_type src2t = src2->type; WSP_GGML_UNUSED(src2t);
                 WSP_GGML_ASSERT(src2t == WSP_GGML_TYPE_I32);
                 WSP_GGML_ASSERT(!wsp_ggml_is_transposed(src0));
@@ -3501,7 +3895,7 @@ static bool wsp_ggml_metal_encode_node(
                     id<MTLBuffer> h_src1 = wsp_ggml_metal_mem_pool_alloc(mem_pool, s_src1);
                     if (!h_src1) {
                         WSP_GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1);
-                        return false;
+                        return 0;
                     }
                     const int64_t neh0 = ne0;
@@ -3517,7 +3911,7 @@ static bool wsp_ggml_metal_encode_node(
                     id<MTLBuffer> h_dst = wsp_ggml_metal_mem_pool_alloc(mem_pool, s_dst);
                     if (!h_dst) {
                         WSP_GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst);
-                        return false;
+                        return 0;
                     }
                     // tokens per expert
@@ -3525,7 +3919,7 @@ static bool wsp_ggml_metal_encode_node(
                     id<MTLBuffer> h_tpe = wsp_ggml_metal_mem_pool_alloc(mem_pool, s_tpe);
                     if (!h_tpe) {
                         WSP_GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe);
-                        return false;
+                        return 0;
                     }
                     // id map
@@ -3534,7 +3928,7 @@ static bool wsp_ggml_metal_encode_node(
                     id<MTLBuffer> h_ids = wsp_ggml_metal_mem_pool_alloc(mem_pool, s_ids);
                     if (!h_ids) {
                         WSP_GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids);
-                        return false;
+                        return 0;
                     }
                     {
@@ -3578,6 +3972,7 @@ static bool wsp_ggml_metal_encode_node(
                             case WSP_GGML_TYPE_Q5_0:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16   ].pipeline; break;
                             case WSP_GGML_TYPE_Q5_1:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16   ].pipeline; break;
                             case WSP_GGML_TYPE_Q8_0:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16   ].pipeline; break;
+                            case WSP_GGML_TYPE_MXFP4:   pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16  ].pipeline; break;
                             case WSP_GGML_TYPE_Q2_K:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16   ].pipeline; break;
                             case WSP_GGML_TYPE_Q3_K:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16   ].pipeline; break;
                             case WSP_GGML_TYPE_Q4_K:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16   ].pipeline; break;
@@ -3713,6 +4108,13 @@ static bool wsp_ggml_metal_encode_node(
                                 nr0 = N_R0_Q8_0;
                                 pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
                             } break;
+                        case WSP_GGML_TYPE_MXFP4:
+                            {
+                                nsg = N_SG_MXFP4;
+                                nr0 = N_R0_MXFP4;
+                                smem = 32*sizeof(float);
+                                pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32].pipeline;
+                            } break;
                         case WSP_GGML_TYPE_Q2_K:
                             {
                                 nsg = N_SG_Q2_K;
@@ -3865,6 +4267,7 @@ static bool wsp_ggml_metal_encode_node(
                     case WSP_GGML_TYPE_Q5_0:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
                     case WSP_GGML_TYPE_Q5_1:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
                     case WSP_GGML_TYPE_Q8_0:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
+                    case WSP_GGML_TYPE_MXFP4:   pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_MXFP4  ].pipeline; break;
                     case WSP_GGML_TYPE_Q2_K:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
                     case WSP_GGML_TYPE_Q3_K:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
                     case WSP_GGML_TYPE_Q4_K:    pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
@@ -3966,12 +4369,95 @@ static bool wsp_ggml_metal_encode_node(
         case WSP_GGML_OP_RMS_NORM:
             {
                 WSP_GGML_ASSERT(ne00 % 4 == 0);
-                WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src0));
+                WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(src0));
                 float eps;
                 memcpy(&eps, dst->op_params, sizeof(float));
-                id<MTLComputePipelineState> pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
+                wsp_ggml_metal_kargs_rms_norm args = {
+                    /*.ne00   =*/ ne00,
+                    /*.ne00_4 =*/ ne00/4,
+                    /*.nb1    =*/ nb1,
+                    /*.nb2    =*/ nb2,
+                    /*.nb3    =*/ nb3,
+                    /*.eps    =*/ eps,
+                    /*.nef1   =*/ { ne01 },
+                    /*.nef2   =*/ { ne02 },
+                    /*.nef3   =*/ { ne03 },
+                    /*.nbf1   =*/ { nb01 },
+                    /*.nbf2   =*/ { nb02 },
+                    /*.nbf3   =*/ { nb03 },
+                };
+                size_t offs_fuse[2] = { 0, 0 };
+                id<MTLBuffer> id_fuse[2] = { id_src0, id_src0 };
+                // d[0] = rms_norm(a)
+                // d[1] = mul(d[0], b)
+                // d[2] = add(d[1], c)
+                if (ctx_dev->use_fusion) {
+                    ops[0] = WSP_GGML_OP_RMS_NORM;
+                    ops[1] = WSP_GGML_OP_MUL;
+                    ops[2] = WSP_GGML_OP_ADD;
+                    for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
+                        if (!wsp_ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
+                            break;
+                        }
+                        if (nodes[n_fuse] != nodes[n_fuse + 1]->src[0]) {
+                            break;
+                        }
+                        if (nodes[n_fuse + 1]->src[1]->ne[0] != node->ne[0]) {
+                            break;
+                        }
+                        if (!wsp_ggml_is_contiguous_rows(nodes[n_fuse + 1]->src[1])) {
+                            break;
+                        }
+                        if (nodes[n_fuse + 1]->type != WSP_GGML_TYPE_F32) {
+                            break;
+                        }
+                        ctx_dev->fuse_cnt[nodes[n_fuse + 1]->op]++;
+                        id_fuse[n_fuse] = wsp_ggml_metal_get_buffer(nodes[n_fuse + 1]->src[1], &offs_fuse[n_fuse]);
+                        args.nef1[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[1];
+                        args.nef2[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[2];
+                        args.nef3[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[3];
+                        args.nbf1[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[1];
+                        args.nbf2[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[2];
+                        args.nbf3[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[3];
+                    }
+                    ++n_fuse;
+                    if (ctx_dev->debug_fusion > 1 && n_fuse > 1) {
+                        if (n_fuse == 2) {
+                            WSP_GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL\n", __func__);
+                        }
+                        if (n_fuse == 3) {
+                            WSP_GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL + ADD\n", __func__);
+                        }
+                    }
+                }
+                if (n_fuse > 1) {
+                    id_dst = wsp_ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
+                }
+                id<MTLComputePipelineState> pipeline;
+                switch (n_fuse) {
+                    case 1: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM        ].pipeline; break;
+                    case 2: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL    ].pipeline; break;
+                    case 3: pipeline = ctx->kernels[WSP_GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD].pipeline; break;
+                    default: WSP_GGML_ABORT("unsupported n_fuse = %d\n", n_fuse);
+                }
                 int nth = 32; // SIMD width
@@ -3982,23 +4468,16 @@ static bool wsp_ggml_metal_encode_node(
                 nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
                 nth = MIN(nth, ne00/4);
-                wsp_ggml_metal_kargs_rms_norm args = {
-                    /*.ne00   =*/ ne00,
-                    /*.ne00_4 =*/ ne00/4,
-                    /*.nb01   =*/ nb01,
-                    /*.eps    =*/ eps,
-                };
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                [encoder setBytes:&args length:sizeof(args)       atIndex:0];
+                [encoder setBuffer:id_src0    offset:offs_src0    atIndex:1];
+                [encoder setBuffer:id_fuse[0] offset:offs_fuse[0] atIndex:2];
+                [encoder setBuffer:id_fuse[1] offset:offs_fuse[1] atIndex:3];
+                [encoder setBuffer:id_dst     offset:offs_dst     atIndex:4];
                 [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-                const int64_t nrows = wsp_ggml_nrows(src0);
-                [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
         case WSP_GGML_OP_L2_NORM:
             {
@@ -4599,11 +5078,14 @@ static bool wsp_ggml_metal_encode_node(
                 WSP_GGML_ASSERT(ne11 == ne21);
                 WSP_GGML_ASSERT(ne12 == ne22);
-                struct wsp_ggml_tensor * src3 = node->src[3];
+                struct wsp_ggml_tensor * src3 = node->src[3]; // mask
+                struct wsp_ggml_tensor * src4 = node->src[4]; // sinks
                 size_t offs_src3 = 0;
+                size_t offs_src4 = 0;
                 id<MTLBuffer> id_src3 = src3 ? wsp_ggml_metal_get_buffer(src3, &offs_src3) : nil;
+                id<MTLBuffer> id_src4 = src4 ? wsp_ggml_metal_get_buffer(src4, &offs_src4) : nil;
                 WSP_GGML_ASSERT(!src3 || src3->type == WSP_GGML_TYPE_F16);
                 WSP_GGML_ASSERT(!src3 || src3->ne[1] >= WSP_GGML_PAD(src0->ne[1], 8) &&
@@ -4619,8 +5101,6 @@ static bool wsp_ggml_metal_encode_node(
                 const uint64_t nb32 = src3 ? src3->nb[2] : 0; WSP_GGML_UNUSED(nb32);
                 const uint64_t nb33 = src3 ? src3->nb[3] : 0; WSP_GGML_UNUSED(nb33);
-                const enum wsp_ggml_type src2t = src2 ? src2->type : WSP_GGML_TYPE_COUNT; WSP_GGML_UNUSED(src2t);
                 float scale;
                 float max_bias;
                 float logit_softcap;
@@ -4983,7 +5463,11 @@ static bool wsp_ggml_metal_encode_node(
                     /*.nb21          =*/ nb21,
                     /*.nb22          =*/ nb22,
                     /*.nb23          =*/ nb23,
+                    /*.ne32          =*/ ne32,
+                    /*.ne33          =*/ ne33,
                     /*.nb31          =*/ nb31,
+                    /*.nb32          =*/ nb32,
+                    /*.nb33          =*/ nb33,
                     /*.ne1           =*/ ne1,
                     /*.ne2           =*/ ne2,
                     /*.scale         =*/ scale,
@@ -5004,7 +5488,12 @@ static bool wsp_ggml_metal_encode_node(
                 } else {
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:4];
                 }
-                [encoder setBuffer:id_dst offset:offs_dst       atIndex:5];
+                if (id_src4) {
+                    [encoder setBuffer:id_src4 offset:offs_src4 atIndex:5];
+                } else {
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:5];
+                }
+                [encoder setBuffer:id_dst offset:offs_dst       atIndex:6];
                 if (!use_vec_kernel) {
                     // half8x8 kernel
@@ -5389,7 +5878,7 @@ static bool wsp_ggml_metal_encode_node(
             }
     }
-    return true;
+    return n_fuse;
 }
 static enum wsp_ggml_status wsp_ggml_metal_graph_compute(
@@ -5895,20 +6384,26 @@ static void wsp_ggml_backend_metal_set_n_cb(wsp_ggml_backend_t backend, int n_cb
         struct wsp_ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
         wsp_ggml_metal_mem_pool_reset(mem_pool);
-        for (int idx = node_start; idx < node_end; ++idx) {
+        for (int idx = node_start; idx < node_end;) {
             if (should_capture) {
                 [encoder pushDebugGroup:[NSString stringWithCString:wsp_ggml_op_desc(wsp_ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
             }
-            const bool res = wsp_ggml_metal_encode_node(backend, idx, encoder, mem_pool);
+            const int res = wsp_ggml_metal_encode_node(backend, idx, node_end, encoder, mem_pool);
+            if (idx + res > node_end) {
+                WSP_GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
+                        "https://github.com/ggml-org/llama.cpp/pull/14849");
+            }
             if (should_capture) {
                 [encoder popDebugGroup];
             }
-            if (!res) {
+            if (res == 0) {
                 break;
             }
+            idx += res;
         }
         [encoder endEncoding];