npm - whisper.rn - Versions diffs - 0.5.2 → 0.5.4 - Mend

whisper.rn 0.5.2 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)

package/cpp/ggml-metal/ggml-metal-device.cpp CHANGED Viewed

@@ -318,6 +318,44 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_sum_rows(wsp_ggml_
     return res;
 }
+// Fetch the "kernel_cumsum_blk_<type>" pipeline for a CUMSUM op, where <type> is
+// the type name of op->src[0]. If wsp_ggml_metal_library_get_pipeline() already
+// has a pipeline under that name it is returned as-is; otherwise the pipeline is
+// compiled with nullptr passed for the last (constant values) argument.
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_cumsum_blk(wsp_ggml_metal_library_t lib, const wsp_ggml_tensor * op) {
+    WSP_GGML_ASSERT(op->op == WSP_GGML_OP_CUMSUM);
+    char kernel_base[256];
+    char cache_key[256];
+    snprintf(kernel_base, sizeof(kernel_base), "kernel_cumsum_blk_%s", wsp_ggml_type_name(op->src[0]->type));
+    // nothing is appended for this kernel, so the lookup key equals the base name
+    snprintf(cache_key, sizeof(cache_key), "%s", kernel_base);
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline(lib, cache_key);
+    if (!pipeline) {
+        pipeline = wsp_ggml_metal_library_compile_pipeline(lib, kernel_base, cache_key, nullptr);
+    }
+    return pipeline;
+}
+// Fetch the "kernel_cumsum_add_<type>" pipeline for a CUMSUM op, where <type> is
+// the type name of op->src[0]. If wsp_ggml_metal_library_get_pipeline() already
+// has a pipeline under that name it is returned as-is; otherwise the pipeline is
+// compiled with nullptr passed for the last (constant values) argument.
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_cumsum_add(wsp_ggml_metal_library_t lib, const wsp_ggml_tensor * op) {
+    WSP_GGML_ASSERT(op->op == WSP_GGML_OP_CUMSUM);
+    char kernel_base[256];
+    char cache_key[256];
+    snprintf(kernel_base, sizeof(kernel_base), "kernel_cumsum_add_%s", wsp_ggml_type_name(op->src[0]->type));
+    // nothing is appended for this kernel, so the lookup key equals the base name
+    snprintf(cache_key, sizeof(cache_key), "%s", kernel_base);
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline(lib, cache_key);
+    if (!pipeline) {
+        pipeline = wsp_ggml_metal_library_compile_pipeline(lib, kernel_base, cache_key, nullptr);
+    }
+    return pipeline;
+}
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_soft_max(wsp_ggml_metal_library_t lib, const wsp_ggml_tensor * op) {
     WSP_GGML_ASSERT(!op->src[1] || op->src[1]->type == WSP_GGML_TYPE_F16 || op->src[1]->type == WSP_GGML_TYPE_F32);
@@ -677,7 +715,7 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_mul_mm_id_map0(wsp
     char name[256];
     snprintf(base, 256, "kernel_mul_mm_id_map0_ne20_%d", ne20);
-    snprintf(name, 256, "%s", base);
+    snprintf(name, 256, "%s_ne02=%d", base, ne02);
     wsp_ggml_metal_pipeline_t res = wsp_ggml_metal_library_get_pipeline(lib, name);
     if (res) {
@@ -943,6 +981,34 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_argsort(wsp_ggml_m
     return res;
 }
+// Fetch the "kernel_argsort_merge_<src0 type>_<dst type>_<asc|desc>" pipeline for
+// an ARGSORT op. The sort order is read from op->op_params[0]; any value other
+// than WSP_GGML_SORT_ORDER_ASC / WSP_GGML_SORT_ORDER_DESC aborts. An existing
+// pipeline with that name is reused, otherwise it is compiled with nullptr
+// passed for the last (constant values) argument.
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_argsort_merge(wsp_ggml_metal_library_t lib, const wsp_ggml_tensor * op) {
+    assert(op->op == WSP_GGML_OP_ARGSORT);
+    char kernel_base[256];
+    char cache_key[256];
+    const wsp_ggml_sort_order sort_order = (wsp_ggml_sort_order) op->op_params[0];
+    const char * order_tag = "undefined";
+    if (sort_order == WSP_GGML_SORT_ORDER_ASC) {
+        order_tag = "asc";
+    } else if (sort_order == WSP_GGML_SORT_ORDER_DESC) {
+        order_tag = "desc";
+    } else {
+        WSP_GGML_ABORT("fatal error");
+    }
+    snprintf(kernel_base, sizeof(kernel_base), "kernel_argsort_merge_%s_%s_%s", wsp_ggml_type_name(op->src[0]->type), wsp_ggml_type_name(op->type), order_tag);
+    snprintf(cache_key, sizeof(cache_key), "%s", kernel_base);
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline(lib, cache_key);
+    if (!pipeline) {
+        pipeline = wsp_ggml_metal_library_compile_pipeline(lib, kernel_base, cache_key, nullptr);
+    }
+    return pipeline;
+}
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_flash_attn_ext_pad(
         wsp_ggml_metal_library_t lib,
         const struct wsp_ggml_tensor * op,
@@ -1332,11 +1398,12 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_rope(wsp_ggml_meta
     const bool is_neox   = mode & WSP_GGML_ROPE_TYPE_NEOX;
     const bool is_mrope  = mode & WSP_GGML_ROPE_TYPE_MROPE;
+    const bool is_imrope = mode == WSP_GGML_ROPE_TYPE_IMROPE;
     const bool is_vision = mode == WSP_GGML_ROPE_TYPE_VISION;
     if (is_neox) {
         snprintf(base, 256, "kernel_rope_neox_%s", wsp_ggml_type_name(op->src[0]->type));
-    } else if (is_mrope && !is_vision) {
+    } else if ((is_mrope || is_imrope) && !is_vision) {
         WSP_GGML_ASSERT(op->src[1]->ne[0]*4 >= op->src[0]->ne[2]); // need at least 4 pos per token
         snprintf(base, 256, "kernel_rope_multi_%s", wsp_ggml_type_name(op->src[0]->type));
     } else if (is_vision) {
@@ -1346,14 +1413,20 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_rope(wsp_ggml_meta
         snprintf(base, 256, "kernel_rope_norm_%s", wsp_ggml_type_name(op->src[0]->type));
     }
-    snprintf(name, 256, "%s", base);
+    snprintf(name, 256, "%s_imrope=%d", base, is_imrope ? 1 : 0);
     wsp_ggml_metal_pipeline_t res = wsp_ggml_metal_library_get_pipeline(lib, name);
     if (res) {
         return res;
     }
-    res = wsp_ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+    wsp_ggml_metal_cv_t cv = wsp_ggml_metal_cv_init();
+    wsp_ggml_metal_cv_set_bool(cv, is_imrope, FC_ROPE + 0);
+    res = wsp_ggml_metal_library_compile_pipeline(lib, base, name, cv);
+    wsp_ggml_metal_cv_free(cv);
     return res;
 }
@@ -1431,6 +1504,30 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_conv_transpose_2d(
     return res;
 }
+// Fetch the "kernel_conv_2d_<src0 type>_<src1 type>" pipeline for a CONV_2D op.
+// Preconditions enforced below: src0 is contiguous and F16 or F32, src1 is F32,
+// and the result type is F32. An existing pipeline with that name is reused,
+// otherwise it is compiled with nullptr passed for the last (constant values)
+// argument.
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_conv_2d(wsp_ggml_metal_library_t lib, const wsp_ggml_tensor * op) {
+    assert(op->op == WSP_GGML_OP_CONV_2D);
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(op->src[0]));
+    WSP_GGML_ASSERT(op->src[0]->type == WSP_GGML_TYPE_F16 || op->src[0]->type == WSP_GGML_TYPE_F32);
+    WSP_GGML_ASSERT(op->src[1]->type == WSP_GGML_TYPE_F32);
+    WSP_GGML_ASSERT(op->type         == WSP_GGML_TYPE_F32);
+    char kernel_base[256];
+    char cache_key[256];
+    snprintf(kernel_base, sizeof(kernel_base), "kernel_conv_2d_%s_%s", wsp_ggml_type_name(op->src[0]->type), wsp_ggml_type_name(op->src[1]->type));
+    snprintf(cache_key, sizeof(cache_key), "%s", kernel_base);
+    wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline(lib, cache_key);
+    if (!pipeline) {
+        pipeline = wsp_ggml_metal_library_compile_pipeline(lib, kernel_base, cache_key, nullptr);
+    }
+    return pipeline;
+}
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_upscale(wsp_ggml_metal_library_t lib, const wsp_ggml_tensor * op) {
     assert(op->op == WSP_GGML_OP_UPSCALE);

package/cpp/ggml-metal/ggml-metal-device.h CHANGED Viewed

@@ -95,7 +95,9 @@ void wsp_ggml_metal_encoder_end_encoding(wsp_ggml_metal_encoder_t encoder);
 typedef struct wsp_ggml_metal_library * wsp_ggml_metal_library_t;
-wsp_ggml_metal_library_t wsp_ggml_metal_library_init(wsp_ggml_metal_device_t dev);
+wsp_ggml_metal_library_t wsp_ggml_metal_library_init            (wsp_ggml_metal_device_t dev);
+wsp_ggml_metal_library_t wsp_ggml_metal_library_init_from_source(wsp_ggml_metal_device_t dev, const char * source, bool verbose);
 void wsp_ggml_metal_library_free(wsp_ggml_metal_library_t lib);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline    (wsp_ggml_metal_library_t lib, const char * name);
@@ -111,6 +113,8 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_unary
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_glu               (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_sum               (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_sum_rows          (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_cumsum_blk        (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_cumsum_add        (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_soft_max          (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_ssm_conv          (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_ssm_scan          (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
@@ -123,6 +127,7 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_mul_mm_id
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_mul_mv_id         (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_argmax            (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_argsort           (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_argsort_merge     (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_bin               (wsp_ggml_metal_library_t lib, enum wsp_ggml_op op, int32_t n_fuse, bool row);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_l2_norm           (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_group_norm        (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
@@ -131,6 +136,7 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_rope
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_im2col            (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_conv_transpose_1d (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_conv_transpose_2d (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_conv_2d           (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_upscale           (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_pad               (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
 wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline_pad_reflect_1d    (wsp_ggml_metal_library_t lib, const struct wsp_ggml_tensor * op);
@@ -193,6 +199,7 @@ struct wsp_ggml_metal_device_props {
     bool has_simdgroup_mm;
     bool has_unified_memory;
     bool has_bfloat;
+    bool has_tensor;
     bool use_residency_sets;
     bool use_shared_buffers;

package/cpp/ggml-metal/ggml-metal-device.m CHANGED Viewed

@@ -21,8 +21,9 @@
 #define WSP_GGML_METAL_HAS_RESIDENCY_SETS 1
 #endif
-// overload of MTLGPUFamilyMetal3 (not available in some environments)
+// overload of MTLGPUFamilyMetalX (not available in some environments)
 static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
+static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
 // virtual address for GPU memory allocations
 static atomic_uintptr_t g_addr_device = 0x000000400ULL;
@@ -180,11 +181,7 @@ wsp_ggml_metal_library_t wsp_ggml_metal_library_init(wsp_ggml_metal_device_t dev
         NSBundle * bundle = [NSBundle bundleForClass:[WSPGGMLMetalClass class]];
 #endif
-#if TARGET_OS_SIMULATOR
-        NSString * path_lib = [bundle pathForResource:@"ggml-whisper-sim" ofType:@"metallib"];
-#else
-        NSString * path_lib = [bundle pathForResource:@"ggml-whisper" ofType:@"metallib"];
-#endif
+        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
         if (path_lib == nil) {
             // Try to find the resource in the directory where the current binary located.
             NSString * bin_cur = [[NSProcessInfo processInfo] arguments][0];
@@ -265,6 +262,10 @@ wsp_ggml_metal_library_t wsp_ggml_metal_library_init(wsp_ggml_metal_device_t dev
                     [prep setObject:@"1" forKey:@"WSP_GGML_METAL_HAS_BF16"];
                 }
+                if (wsp_ggml_metal_device_get_props(dev)->has_tensor) {
+                    [prep setObject:@"1" forKey:@"WSP_GGML_METAL_HAS_TENSOR"];
+                }
 #if WSP_GGML_METAL_EMBED_LIBRARY
                 [prep setObject:@"1" forKey:@"WSP_GGML_METAL_EMBED_LIBRARY"];
 #endif
@@ -302,6 +303,72 @@ wsp_ggml_metal_library_t wsp_ggml_metal_library_init(wsp_ggml_metal_device_t dev
     return res;
 }
+// Compile a Metal library directly from a NUL-terminated source string.
+//
+//   dev     - device whose MTLDevice object is used to compile the source
+//   source  - Metal shader source text; NULL is rejected
+//   verbose - when true, log the compiler error description, the generic
+//             "failed to create" message and the compile time
+//
+// Returns a newly allocated library handle, or NULL on any failure. Anything
+// reported through `error` by the compiler is treated as a failure.
+// This file uses manual reference counting, so every failure path that still
+// holds the compiled MTLLibrary must release it.
+wsp_ggml_metal_library_t wsp_ggml_metal_library_init_from_source(wsp_ggml_metal_device_t dev, const char * source, bool verbose) {
+    if (source == NULL) {
+        WSP_GGML_LOG_ERROR("%s: source is NULL\n", __func__);
+        return NULL;
+    }
+    id<MTLDevice> device = wsp_ggml_metal_device_get_obj(dev);
+    id<MTLLibrary> library = nil;
+    NSError * error = nil;
+    const int64_t t_start = wsp_ggml_time_us();
+    NSString * src = [[NSString alloc] initWithBytes:source
+                                              length:strlen(source)
+                                            encoding:NSUTF8StringEncoding];
+    if (!src) {
+        WSP_GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
+        return NULL;
+    }
+    @autoreleasepool {
+        NSMutableDictionary * prep = [NSMutableDictionary dictionary];
+        MTLCompileOptions * options = [MTLCompileOptions new];
+        options.preprocessorMacros = prep;
+        library = [device newLibraryWithSource:src options:options error:&error];
+        if (error) {
+            if (verbose) {
+                WSP_GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
+            } else {
+                WSP_GGML_LOG_ERROR("%s: error compiling source\n", __func__);
+            }
+            // Metal can report warnings through `error` while still returning a
+            // library object; release it before dropping the reference so it is
+            // not leaked (messaging nil is a no-op when nothing was returned)
+            [library release];
+            library = nil;
+        }
+        [options release];
+    }
+    [src release];
+    if (!library) {
+        if (verbose) {
+            WSP_GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
+        }
+        return NULL;
+    }
+    if (verbose) {
+        WSP_GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (wsp_ggml_time_us() - t_start) / 1e6);
+    }
+    wsp_ggml_metal_library_t res = calloc(1, sizeof(struct wsp_ggml_metal_library));
+    if (!res) {
+        WSP_GGML_LOG_ERROR("%s: calloc failed\n", __func__);
+        // the compiled library has no owner on this path - release it
+        [library release];
+        return NULL;
+    }
+    // the handle keeps the reference obtained from newLibraryWithSource
+    res->obj       = library;
+    res->device    = device;
+    res->pipelines = wsp_ggml_metal_pipelines_init();
+    return res;
+}
 void wsp_ggml_metal_library_free(wsp_ggml_metal_library_t lib) {
     if (!lib) {
         return;
@@ -349,9 +416,9 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_compile_pipeline(wsp_ggml_metal
         if (!mtl_function) {
             wsp_ggml_critical_section_end();
-            WSP_GGML_LOG_ERROR("%s: error: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
+            WSP_GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
             if (error) {
-                WSP_GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                WSP_GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
             }
             return nil;
@@ -359,13 +426,21 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_compile_pipeline(wsp_ggml_metal
         res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
-        wsp_ggml_metal_pipelines_add(lib->pipelines, name, res);
         [mtl_function release];
         WSP_GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
                 (int) res->obj.maxTotalThreadsPerThreadgroup,
                 (int) res->obj.threadExecutionWidth);
+        if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
+            wsp_ggml_critical_section_end();
+            WSP_GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
+            return nil;
+        }
+        wsp_ggml_metal_pipelines_add(lib->pipelines, name, res);
     }
     wsp_ggml_critical_section_end();
@@ -473,6 +548,128 @@ wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
             dev->props.has_bfloat  = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
             dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
+            if (getenv("WSP_GGML_METAL_BF16_DISABLE") != NULL) {
+                dev->props.has_bfloat = false;
+            }
+            dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
+            if (getenv("WSP_GGML_METAL_TENSOR_DISABLE") != NULL) {
+                dev->props.has_tensor = false;
+            }
+            // note: disable the tensor API by default for old chips because with the current implementation it is not useful
+            // - M2 Ultra:   ~5% slower
+            // - M4, M4 Max: no significant difference
+            //
+            // TODO: try to update the tensor API kernels to at least match the simdgroup performance
+            if (getenv("WSP_GGML_METAL_TENSOR_ENABLE") == NULL &&
+                ![[dev->mtl_device name] containsString:@"M5"] &&
+                ![[dev->mtl_device name] containsString:@"M6"] &&
+                ![[dev->mtl_device name] containsString:@"A19"] &&
+                ![[dev->mtl_device name] containsString:@"A20"]) {
+                WSP_GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
+                dev->props.has_tensor = false;
+            }
+            // double-check that the tensor API compiles
+            if (dev->props.has_tensor) {
+                const char * src_tensor_f16 = "\n"
+                    "#include <metal_stdlib> \n"
+                    "#include <metal_tensor> \n"
+                    "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
+                    " \n"
+                    "using namespace metal; \n"
+                    "using namespace mpp::tensor_ops; \n"
+                    " \n"
+                    "kernel void dummy_kernel( \n"
+                    "    tensor<device  half, dextents<int32_t, 2>> A [[buffer(0)]], \n"
+                    "    tensor<device  half, dextents<int32_t, 2>> B [[buffer(1)]], \n"
+                    "    device float * C [[buffer(2)]], \n"
+                    "    uint2 tgid [[threadgroup_position_in_grid]]) \n"
+                    "{ \n"
+                    "    auto tA = A.slice(0, (int)tgid.y); \n"
+                    "    auto tB = B.slice((int)tgid.x, 0); \n"
+                    " \n"
+                    "    matmul2d< \n"
+                    "        matmul2d_descriptor(8, 8, dynamic_extent), \n"
+                    "        execution_simdgroups<4>> mm; \n"
+                    " \n"
+                    "    auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
+                    " \n"
+                    "    auto sA = tA.slice(0, 0); \n"
+                    "    auto sB = tB.slice(0, 0); \n"
+                    "    mm.run(sB, sA, cT); \n"
+                    " \n"
+                    "    auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
+                    " \n"
+                    "    cT.store(tC); \n"
+                    "}";
+                WSP_GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__);
+                wsp_ggml_metal_library_t lib = wsp_ggml_metal_library_init_from_source(dev, src_tensor_f16, false);
+                if (lib == NULL) {
+                    WSP_GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
+                    dev->props.has_tensor = false;
+                } else {
+                    wsp_ggml_metal_pipeline_t ppl = wsp_ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
+                    if (!ppl) {
+                        WSP_GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
+                        dev->props.has_tensor = false;
+                    }
+                    wsp_ggml_metal_library_free(lib);
+                }
+            }
+            // try to compile a dummy kernel to determine if the tensor API is supported for bfloat
+            if (dev->props.has_tensor && dev->props.has_bfloat) {
+                const char * src_tensor_bf16 = "\n"
+                    "#include <metal_stdlib> \n"
+                    "#include <metal_tensor> \n"
+                    "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
+                    " \n"
+                    "using namespace metal; \n"
+                    "using namespace mpp::tensor_ops; \n"
+                    " \n"
+                    "kernel void dummy_kernel( \n"
+                    "    tensor<device bfloat, dextents<int32_t, 2>> A [[buffer(0)]], \n"
+                    "    tensor<device bfloat, dextents<int32_t, 2>> B [[buffer(1)]], \n"
+                    "    device float * C [[buffer(2)]], \n"
+                    "    uint2 tgid [[threadgroup_position_in_grid]]) \n"
+                    "{ \n"
+                    "    auto tA = A.slice(0, (int)tgid.y); \n"
+                    "    auto tB = B.slice((int)tgid.x, 0); \n"
+                    " \n"
+                    "    matmul2d< \n"
+                    "        matmul2d_descriptor(8, 8, dynamic_extent), \n"
+                    "        execution_simdgroups<4>> mm; \n"
+                    " \n"
+                    "    auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
+                    " \n"
+                    "    auto sA = tA.slice(0, 0); \n"
+                    "    auto sB = tB.slice(0, 0); \n"
+                    "    mm.run(sB, sA, cT); \n"
+                    " \n"
+                    "    auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
+                    " \n"
+                    "    cT.store(tC); \n"
+                    "}";
+                WSP_GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
+                wsp_ggml_metal_library_t lib = wsp_ggml_metal_library_init_from_source(dev, src_tensor_bf16, false);
+                if (lib == NULL) {
+                    WSP_GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
+                    dev->props.has_bfloat = false;
+                } else {
+                    wsp_ggml_metal_pipeline_t ppl = wsp_ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
+                    if (!ppl) {
+                        WSP_GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
+                        dev->props.has_bfloat = false;
+                    }
+                    wsp_ggml_metal_library_free(lib);
+                }
+            }
             dev->props.use_residency_sets = true;
 #if defined(WSP_GGML_METAL_HAS_RESIDENCY_SETS)
@@ -480,7 +677,6 @@ wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
 #endif
             dev->props.use_shared_buffers = dev->props.has_unified_memory;
             if (getenv("WSP_GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
                 dev->props.use_shared_buffers = false;
             }
@@ -533,6 +729,7 @@ wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
             WSP_GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm        ? "true" : "false");
             WSP_GGML_LOG_INFO("%s: has unified memory    = %s\n", __func__, dev->props.has_unified_memory      ? "true" : "false");
             WSP_GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, dev->props.has_bfloat              ? "true" : "false");
+            WSP_GGML_LOG_INFO("%s: has tensor            = %s\n", __func__, dev->props.has_tensor              ? "true" : "false");
             WSP_GGML_LOG_INFO("%s: use residency sets    = %s\n", __func__, dev->props.use_residency_sets      ? "true" : "false");
             WSP_GGML_LOG_INFO("%s: use shared buffers    = %s\n", __func__, dev->props.use_shared_buffers      ? "true" : "false");
@@ -673,6 +870,7 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
         case WSP_GGML_OP_SUM:
             return has_simdgroup_reduction && wsp_ggml_is_contiguous(op->src[0]);
         case WSP_GGML_OP_SUM_ROWS:
+        case WSP_GGML_OP_CUMSUM:
         case WSP_GGML_OP_MEAN:
         case WSP_GGML_OP_SOFT_MAX:
         case WSP_GGML_OP_GROUP_NORM:
@@ -688,6 +886,11 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
             return true;
         case WSP_GGML_OP_IM2COL:
             return wsp_ggml_is_contiguous(op->src[1]) && op->src[1]->type == WSP_GGML_TYPE_F32 && (op->type == WSP_GGML_TYPE_F16 || op->type == WSP_GGML_TYPE_F32);
+        case WSP_GGML_OP_CONV_2D:
+            return wsp_ggml_is_contiguous(op->src[0]) &&
+                   op->src[1]->type == WSP_GGML_TYPE_F32 &&
+                   op->type == WSP_GGML_TYPE_F32 &&
+                   (op->src[0]->type == WSP_GGML_TYPE_F16 || op->src[0]->type == WSP_GGML_TYPE_F32);
         case WSP_GGML_OP_POOL_1D:
             return false;
         case WSP_GGML_OP_UPSCALE:
@@ -702,8 +905,6 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
         case WSP_GGML_OP_LEAKY_RELU:
             return op->src[0]->type == WSP_GGML_TYPE_F32;
         case WSP_GGML_OP_ARGSORT:
-            // TODO: Support arbitrary column width
-            return op->src[0]->ne[0] <= 1024;
         case WSP_GGML_OP_ARANGE:
             return true;
         case WSP_GGML_OP_FLASH_ATTN_EXT:
@@ -711,6 +912,7 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
             if (op->src[0]->ne[0] != 32 &&
                 op->src[0]->ne[0] != 40 &&
                 op->src[0]->ne[0] != 64 &&
+                op->src[0]->ne[0] != 72 &&
                 op->src[0]->ne[0] != 80 &&
                 op->src[0]->ne[0] != 96 &&
                 op->src[0]->ne[0] != 112 &&
@@ -787,7 +989,7 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
                                 return false;
                         }
                     case WSP_GGML_TYPE_I32:
-                        return op->type == WSP_GGML_TYPE_F32;
+                        return op->type == WSP_GGML_TYPE_F32 || op->type == WSP_GGML_TYPE_I32;
                     default:
                         return false;
                 };

package/cpp/ggml-metal/ggml-metal-impl.h CHANGED Viewed

@@ -76,6 +76,7 @@
 #define FC_FLASH_ATTN_EXT_VEC_REDUCE   500
 #define FC_MUL_MV                      600
 #define FC_MUL_MM                      700
+#define FC_ROPE                        800
 // op-specific constants
 #define OP_FLASH_ATTN_EXT_NQPTG 8
@@ -527,6 +528,36 @@ typedef struct {
     uint64_t nb2;
 } wsp_ggml_metal_kargs_conv_transpose_2d;
+typedef struct { // argument block for the conv_2d kernel (per the type name); NOTE(review): field order/types presumably must match the Metal-side struct - confirm before changing
+    uint64_t nb00; // nb00..nb03: presumably byte strides of src0 (ggml nb* naming) - TODO confirm
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    uint64_t nb10; // nb10..nb13: presumably byte strides of src1 - TODO confirm
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    uint64_t nb0; // nb0..nb3: presumably byte strides of the destination - TODO confirm
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    int32_t  IW; // IW/IH: presumably input width/height - verify against the encoder
+    int32_t  IH;
+    int32_t  KW; // KW/KH: presumably kernel width/height - verify against the encoder
+    int32_t  KH;
+    int32_t  IC; // IC/OC: presumably input/output channel counts - verify against the encoder
+    int32_t  OC;
+    int32_t  OW; // OW/OH: presumably output width/height - verify against the encoder
+    int32_t  OH;
+    int32_t  N; // presumably the batch size - TODO confirm
+    int32_t  s0; // s0/s1, p0/p1, d0/d1: presumably stride, padding and dilation per axis - TODO confirm
+    int32_t  s1;
+    int32_t  p0;
+    int32_t  p1;
+    int32_t  d0;
+    int32_t  d1;
+} wsp_ggml_metal_kargs_conv_2d;
 typedef struct {
     uint64_t  ofs0;
     uint64_t  ofs1;
@@ -581,6 +612,45 @@ typedef struct {
     uint64_t nb3;
 } wsp_ggml_metal_kargs_sum_rows;
+typedef struct { // argument block for the cumsum_blk kernel (per the type name); NOTE(review): layout presumably must match the Metal-side struct - confirm before changing
+    int64_t  ne00; // ne00..ne03: presumably element counts of src0 per dimension (ggml ne* naming) - TODO confirm
+    int64_t  ne01;
+    int64_t  ne02;
+    int64_t  ne03;
+    uint64_t nb00; // nb00..nb03: presumably byte strides of src0 - TODO confirm
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int64_t  net0; // net0..net3: presumably extents of a second (temporary?) tensor - meaning not visible here, verify against the encoder
+    int64_t  net1;
+    int64_t  net2;
+    int64_t  net3;
+    uint64_t nbt0; // nbt0..nbt3: presumably byte strides matching net0..net3 - verify against the encoder
+    uint64_t nbt1;
+    uint64_t nbt2;
+    uint64_t nbt3;
+    bool     outb; // NOTE(review): meaning not visible in this view; only field that differs from wsp_ggml_metal_kargs_cumsum_add
+} wsp_ggml_metal_kargs_cumsum_blk;
+typedef struct { // argument block for the cumsum_add kernel (per the type name); NOTE(review): layout presumably must match the Metal-side struct - confirm before changing
+    int64_t  ne00; // ne00..ne03: presumably element counts of src0 per dimension (ggml ne* naming) - TODO confirm
+    int64_t  ne01;
+    int64_t  ne02;
+    int64_t  ne03;
+    uint64_t nb00; // nb00..nb03: presumably byte strides of src0 - TODO confirm
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int64_t  net0; // net0..net3: presumably extents of a second (temporary?) tensor - meaning not visible here, verify against the encoder
+    int64_t  net1;
+    int64_t  net2;
+    int64_t  net3;
+    uint64_t nbt0; // nbt0..nbt3: presumably byte strides matching net0..net3 - verify against the encoder
+    uint64_t nbt1;
+    uint64_t nbt2;
+    uint64_t nbt3;
+} wsp_ggml_metal_kargs_cumsum_add;
 typedef struct {
     int32_t  ne00;
     int32_t  ne01;
@@ -762,10 +832,28 @@ typedef struct {
 } wsp_ggml_metal_kargs_leaky_relu;
 typedef struct {
-    int64_t  ncols;
-    int64_t  ncols_pad;
+    int64_t  ne00;
+    int64_t  ne01;
+    int64_t  ne02;
+    int64_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
 } wsp_ggml_metal_kargs_argsort;
+typedef struct { // argument block for the argsort_merge kernel (per the type name); NOTE(review): layout presumably must match the Metal-side struct - confirm before changing
+    int64_t  ne00; // ne00..ne03: presumably element counts of src0 per dimension (ggml ne* naming) - TODO confirm
+    int64_t  ne01;
+    int64_t  ne02;
+    int64_t  ne03;
+    uint64_t nb00; // nb00..nb03: presumably byte strides of src0 - TODO confirm
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  len; // NOTE(review): presumably the length of the runs being merged - not visible here, verify against the encoder
+} wsp_ggml_metal_kargs_argsort_merge;
 typedef struct {
     int64_t  ne0;
     float    start;