whisper.rn 0.5.0-rc.9 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/gradle.properties +1 -1
- package/cpp/ggml-alloc.c +265 -141
- package/cpp/ggml-backend-impl.h +4 -1
- package/cpp/ggml-backend-reg.cpp +30 -13
- package/cpp/ggml-backend.cpp +221 -38
- package/cpp/ggml-backend.h +17 -1
- package/cpp/ggml-common.h +17 -0
- package/cpp/ggml-cpu/amx/amx.cpp +4 -2
- package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/ggml-cpu/arch-fallback.h +32 -2
- package/cpp/ggml-cpu/common.h +14 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
- package/cpp/ggml-cpu/ggml-cpu.c +70 -42
- package/cpp/ggml-cpu/ggml-cpu.cpp +35 -28
- package/cpp/ggml-cpu/ops.cpp +1587 -1177
- package/cpp/ggml-cpu/ops.h +5 -8
- package/cpp/ggml-cpu/quants.c +35 -0
- package/cpp/ggml-cpu/quants.h +8 -0
- package/cpp/ggml-cpu/repack.cpp +458 -47
- package/cpp/ggml-cpu/repack.h +22 -0
- package/cpp/ggml-cpu/simd-mappings.h +89 -60
- package/cpp/ggml-cpu/traits.cpp +2 -2
- package/cpp/ggml-cpu/traits.h +1 -1
- package/cpp/ggml-cpu/vec.cpp +170 -26
- package/cpp/ggml-cpu/vec.h +506 -63
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-impl.h +119 -9
- package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/ggml-metal/ggml-metal-context.m +600 -0
- package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
- package/cpp/ggml-metal/ggml-metal-device.h +226 -0
- package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
- package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
- package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
- package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
- package/cpp/ggml-metal/ggml-metal.cpp +718 -0
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/cpp/ggml-metal-impl.h +90 -51
- package/cpp/ggml-metal.h +1 -6
- package/cpp/ggml-opt.cpp +97 -41
- package/cpp/ggml-opt.h +25 -6
- package/cpp/ggml-quants.c +111 -16
- package/cpp/ggml-quants.h +6 -0
- package/cpp/ggml.c +486 -98
- package/cpp/ggml.h +221 -16
- package/cpp/gguf.cpp +8 -1
- package/cpp/jsi/RNWhisperJSI.cpp +25 -6
- package/cpp/jsi/ThreadPool.h +3 -3
- package/cpp/whisper.cpp +100 -76
- package/cpp/whisper.h +1 -0
- package/ios/CMakeLists.txt +6 -1
- package/ios/RNWhisper.mm +6 -6
- package/ios/RNWhisperContext.mm +2 -0
- package/ios/RNWhisperVadContext.mm +16 -13
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +13 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/realtime-transcription/RealtimeTranscriber.js +13 -0
- package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
- package/lib/typescript/realtime-transcription/types.d.ts +6 -0
- package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/realtime-transcription/RealtimeTranscriber.ts +17 -0
- package/src/realtime-transcription/types.ts +6 -0
- package/src/version.json +1 -1
- package/whisper-rn.podspec +8 -9
- package/cpp/ggml-metal.m +0 -6284
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-metal/ggml-metal-device.m
@@ -0,0 +1,1312 @@
+#import "ggml-metal-device.h"
+
+#import "ggml-impl.h"
+#import "ggml-threading.h"
+
+#include <Foundation/Foundation.h>
+
+#include <Metal/Metal.h>
+
+#ifndef TARGET_OS_VISION
+#define TARGET_OS_VISION 0
+#endif
+
+// create residency sets only on macOS >= 15.0
+#if !TARGET_CPU_X86_64 && TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 || \
+    TARGET_OS_IOS && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000 || \
+    TARGET_OS_TV && __TV_OS_VERSION_MAX_ALLOWED >= 180000 || \
+    TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000
+#define WSP_GGML_METAL_HAS_RESIDENCY_SETS 1
+#endif
+
+// overload of MTLGPUFamilyMetal3 (not available in some environments)
+static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
+
+#if !WSP_GGML_METAL_EMBED_LIBRARY
+// Here to assist with NSBundle Path Hack
+@interface WSPGGMLMetalClass : NSObject
+@end
+@implementation WSPGGMLMetalClass
+@end
+#endif
+
+//
+// MTLFunctionConstantValues wrapper
+//
+
+struct wsp_ggml_metal_cv {
+    MTLFunctionConstantValues * obj;
+};
+
+wsp_ggml_metal_cv_t wsp_ggml_metal_cv_init(void) {
+    wsp_ggml_metal_cv_t res = calloc(1, sizeof(struct wsp_ggml_metal_cv));
+
+    res->obj = [[MTLFunctionConstantValues alloc] init];
+
+    return res;
+}
+
+void wsp_ggml_metal_cv_free(wsp_ggml_metal_cv_t cv) {
+    [cv->obj release];
+    free(cv);
+}
+
+void wsp_ggml_metal_cv_set_int16(wsp_ggml_metal_cv_t cv, int16_t value, int32_t idx) {
+    [cv->obj setConstantValue:&value type:MTLDataTypeShort atIndex:idx];
+}
+
+void wsp_ggml_metal_cv_set_int32(wsp_ggml_metal_cv_t cv, int32_t value, int32_t idx) {
+    [cv->obj setConstantValue:&value type:MTLDataTypeInt atIndex:idx];
+}
+
+void wsp_ggml_metal_cv_set_bool(wsp_ggml_metal_cv_t cv, bool value, int32_t idx) {
+    [cv->obj setConstantValue:&value type:MTLDataTypeBool atIndex:idx];
+}
+
+//
+// MTLComputePipelineState wrapper
+//
+
+struct wsp_ggml_metal_pipeline {
+    id<MTLComputePipelineState> obj;
+
+    // suggested dispatch sizes
+    int nsg;
+
+    int nr0;
+    int nr1;
+
+    size_t smem;
+};
+
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_pipeline_init(void) {
+    wsp_ggml_metal_pipeline_t res = calloc(1, sizeof(struct wsp_ggml_metal_pipeline));
+
+    *res = (struct wsp_ggml_metal_pipeline) {
+        /*.obj  =*/ nil,
+        /*.nsg  =*/ 0,
+        /*.nr0  =*/ 0,
+        /*.nr1  =*/ 0,
+        /*.smem =*/ 0,
+    };
+
+    return res;
+}
+
+void wsp_ggml_metal_pipeline_free(wsp_ggml_metal_pipeline_t pipeline) {
+    [pipeline->obj release];
+
+    free(pipeline);
+}
+
+void wsp_ggml_metal_pipeline_set_nsg(wsp_ggml_metal_pipeline_t pipeline, int nsg) {
+    pipeline->nsg = nsg;
+}
+
+int wsp_ggml_metal_pipeline_get_nsg(wsp_ggml_metal_pipeline_t pipeline) {
+    return pipeline->nsg;
+}
+
+void wsp_ggml_metal_pipeline_set_nr0(wsp_ggml_metal_pipeline_t pipeline, int nr0) {
+    pipeline->nr0 = nr0;
+}
+
+int wsp_ggml_metal_pipeline_get_nr0(wsp_ggml_metal_pipeline_t pipeline) {
+    return pipeline->nr0;
+}
+
+void wsp_ggml_metal_pipeline_set_nr1(wsp_ggml_metal_pipeline_t pipeline, int nr1) {
+    pipeline->nr1 = nr1;
+}
+
+int wsp_ggml_metal_pipeline_get_nr1(wsp_ggml_metal_pipeline_t pipeline) {
+    return pipeline->nr1;
+}
+
+void wsp_ggml_metal_pipeline_set_smem(wsp_ggml_metal_pipeline_t pipeline, size_t smem) {
+    pipeline->smem = smem;
+}
+
+size_t wsp_ggml_metal_pipeline_get_smem(wsp_ggml_metal_pipeline_t pipeline) {
+    return pipeline->smem;
+}
+
+int wsp_ggml_metal_pipeline_max_theads_per_threadgroup(wsp_ggml_metal_pipeline_t pipeline) {
+    return pipeline->obj.maxTotalThreadsPerThreadgroup;
+}
+
+struct wsp_ggml_metal_library {
+    id<MTLLibrary> obj;
+    id<MTLDevice>  device;
+
+    wsp_ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
+};
+
+wsp_ggml_metal_library_t wsp_ggml_metal_library_init(wsp_ggml_metal_device_t dev) {
+    id<MTLLibrary> library = nil;
+    id<MTLDevice>  device  = wsp_ggml_metal_device_get_obj(dev);
+
+    // load library
+    //
+    // - first check if the library is embedded
+    // - then check if the library is in the bundle
+    // - if not found, load the source and compile it
+    // - if that fails, return NULL
+    //
+    // TODO: move to a function
+    {
+        const int64_t t_start = wsp_ggml_time_us();
+
+        NSError  * error = nil;
+        NSString * src   = nil;
+
+#if WSP_GGML_METAL_EMBED_LIBRARY
+        WSP_GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
+
+        extern const char wsp_ggml_metallib_start[];
+        extern const char wsp_ggml_metallib_end[];
+
+        src = [[NSString alloc] initWithBytes:wsp_ggml_metallib_start length:(wsp_ggml_metallib_end-wsp_ggml_metallib_start) encoding:NSUTF8StringEncoding];
+#else
+
+#ifdef SWIFT_PACKAGE
+        NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
+#else
+        NSBundle * bundle = [NSBundle bundleForClass:[WSPGGMLMetalClass class]];
+#endif
+
+#if TARGET_OS_SIMULATOR
+        NSString * path_lib = [bundle pathForResource:@"ggml-whisper-sim" ofType:@"metallib"];
+#else
+        NSString * path_lib = [bundle pathForResource:@"ggml-whisper" ofType:@"metallib"];
+#endif
+        if (path_lib == nil) {
+            // Try to find the resource in the directory where the current binary located.
+            NSString * bin_cur = [[NSProcessInfo processInfo] arguments][0];
+            NSString * bin_dir = [bin_cur stringByDeletingLastPathComponent];
+
+            NSString * path_lib_default = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
+            if ([[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
+                WSP_GGML_LOG_INFO("%s: found '%s'\n", __func__, [path_lib_default UTF8String]);
+
+                NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:path_lib_default error:&error];
+                if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
+                    // Optionally, if this is a symlink, try to resolve it.
+                    path_lib_default = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:path_lib_default error:&error];
+                    if (path_lib_default && [path_lib_default length] > 0 && ![[path_lib_default substringToIndex:1] isEqualToString:@"/"]) {
+                        // It is a relative path, adding the binary directory as directory prefix.
+                        path_lib_default = [NSString pathWithComponents:@[bin_dir, path_lib_default]];
+                    }
+                    if (!path_lib_default || ![[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
+                        // Link to the resource could not be resolved.
+                        path_lib_default = nil;
+                    } else {
+                        WSP_GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [path_lib_default UTF8String]);
+                    }
+                }
+            } else {
+                // The resource couldn't be found in the binary's directory.
+                path_lib_default = nil;
+            }
+
+            path_lib = path_lib_default;
+        }
+
+        if (path_lib != nil) {
+            // pre-compiled library found
+            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
+            WSP_GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
+
+            library = [device newLibraryWithURL:libURL error:&error];
+            if (error) {
+                WSP_GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                return nil;
+            }
+        } else {
+            WSP_GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
+
+            NSString * path_source;
+            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"WSP_GGML_METAL_PATH_RESOURCES"];
+
+            WSP_GGML_LOG_INFO("%s: WSP_GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
+
+            if (path_resource) {
+                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
+            } else {
+                path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+            }
+
+            if (path_source == nil) {
+                WSP_GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
+                path_source = @"ggml-metal.metal";
+            }
+
+            WSP_GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+
+            src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
+            if (error) {
+                WSP_GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                return nil;
+            }
+        }
+#endif
+
+        if (!library) {
+            @autoreleasepool {
+                // dictionary of preprocessor macros
+                NSMutableDictionary * prep = [NSMutableDictionary dictionary];
+
+                if (wsp_ggml_metal_device_get_props(dev)->has_bfloat) {
+                    [prep setObject:@"1" forKey:@"WSP_GGML_METAL_HAS_BF16"];
+                }
+
+#if WSP_GGML_METAL_EMBED_LIBRARY
+                [prep setObject:@"1" forKey:@"WSP_GGML_METAL_EMBED_LIBRARY"];
+#endif
+
+                MTLCompileOptions * options = [MTLCompileOptions new];
+                options.preprocessorMacros = prep;
+
+                //[options setFastMathEnabled:false];
+
+                library = [device newLibraryWithSource:src options:options error:&error];
+                if (error) {
+                    WSP_GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                    return nil;
+                }
+
+#if !__has_feature(objc_arc)
+                [options release];
+#endif
+            }
+        }
+
+#if WSP_GGML_METAL_EMBED_LIBRARY
+        [src release];
+#endif // WSP_GGML_METAL_EMBED_LIBRARY
+
+        WSP_GGML_LOG_INFO("%s: loaded in %.3f sec\n", __func__, (wsp_ggml_time_us() - t_start) / 1e6);
+    }
+
+    wsp_ggml_metal_library_t res = calloc(1, sizeof(struct wsp_ggml_metal_library));
+
+    res->obj = library;
+    res->device = device;
+    res->pipelines = wsp_ggml_metal_pipelines_init();
+
+    return res;
+}
+
+void wsp_ggml_metal_library_free(wsp_ggml_metal_library_t lib) {
+    if (!lib) {
+        return;
+    }
+
+    if (lib->obj) {
+        [lib->obj release];
+    }
+
+    wsp_ggml_metal_pipelines_free(lib->pipelines);
+
+    free(lib);
+}
+
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline(wsp_ggml_metal_library_t lib, const char * name) {
+    return wsp_ggml_metal_pipelines_get(lib->pipelines, name);
+}
+
+wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_compile_pipeline(wsp_ggml_metal_library_t lib, const char * base, const char * name, wsp_ggml_metal_cv_t cv) {
+    // note: the pipelines are cached in the library per device, so they are shared across all metal contexts
+    wsp_ggml_critical_section_start();
+
+    wsp_ggml_metal_pipeline_t res = wsp_ggml_metal_library_get_pipeline(lib, name);
+    if (res) {
+        wsp_ggml_critical_section_end();
+
+        return res;
+    }
+
+    res = wsp_ggml_metal_pipeline_init();
+
+    @autoreleasepool {
+        NSError * error = nil;
+
+        NSString * base_func = [NSString stringWithUTF8String:base];
+
+        WSP_GGML_LOG_DEBUG("%s: compiling pipeline: base = '%s', name = '%s'\n", __func__, base, name);
+
+        id<MTLFunction> mtl_function;
+        if (!cv) {
+            mtl_function = [lib->obj newFunctionWithName:base_func];
+        } else {
+            mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
+        }
+        if (!mtl_function) {
+            wsp_ggml_critical_section_end();
+
+            WSP_GGML_LOG_ERROR("%s: error: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
+            if (error) {
+                WSP_GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            }
+
+            return nil;
+        }
+
+        res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
+
+        wsp_ggml_metal_pipelines_add(lib->pipelines, name, res);
+
+        [mtl_function release];
+
+        WSP_GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
+                (int) res->obj.maxTotalThreadsPerThreadgroup,
+                (int) res->obj.threadExecutionWidth);
+    }
+
+    wsp_ggml_critical_section_end();
+
+    return res;
+}
+
+//
+// MTLComputeCommandEncoder wrapper
+//
+
+struct wsp_ggml_metal_encoder {
+    id<MTLComputeCommandEncoder> obj;
+};
+
+wsp_ggml_metal_encoder_t wsp_ggml_metal_encoder_init(wsp_ggml_metal_cmd_buf_t cmd_buf_raw, bool concurrent) {
+    wsp_ggml_metal_encoder_t res = calloc(1, sizeof(struct wsp_ggml_metal_encoder));
+
+    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
+
+    if (concurrent) {
+        res->obj = [cmd_buf computeCommandEncoderWithDispatchType: MTLDispatchTypeConcurrent];
+    } else {
+        res->obj = [cmd_buf computeCommandEncoder];
+    }
+
+    [res->obj retain];
+
+    return res;
+}
+
+void wsp_ggml_metal_encoder_free(wsp_ggml_metal_encoder_t encoder) {
+    [encoder->obj release];
+    free(encoder);
+}
+
+void wsp_ggml_metal_encoder_debug_group_push(wsp_ggml_metal_encoder_t encoder, const char * name) {
+    [encoder->obj pushDebugGroup:[NSString stringWithCString:name encoding:NSUTF8StringEncoding]];
+}
+
+void wsp_ggml_metal_encoder_debug_group_pop (wsp_ggml_metal_encoder_t encoder) {
+    [encoder->obj popDebugGroup];
+}
+
+void wsp_ggml_metal_encoder_set_pipeline(wsp_ggml_metal_encoder_t encoder, wsp_ggml_metal_pipeline_t pipeline) {
+    [encoder->obj setComputePipelineState:pipeline->obj];
+}
+
+void wsp_ggml_metal_encoder_set_bytes(wsp_ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
+    [encoder->obj setBytes:data length:size atIndex:idx];
+}
+
+void wsp_ggml_metal_encoder_set_buffer(wsp_ggml_metal_encoder_t encoder, struct wsp_ggml_metal_buffer_id buffer, int idx) {
+    [encoder->obj setBuffer:buffer.metal offset:buffer.offs atIndex:idx];
+}
+
+void wsp_ggml_metal_encoder_set_threadgroup_memory_size(wsp_ggml_metal_encoder_t encoder, size_t size, int idx) {
+    [encoder->obj setThreadgroupMemoryLength:size atIndex:idx];
+}
+
+void wsp_ggml_metal_encoder_dispatch_threadgroups(wsp_ggml_metal_encoder_t encoder, int tg0, int tg1, int tg2, int tptg0, int tptg1, int tptg2) {
+    [encoder->obj dispatchThreadgroups:MTLSizeMake(tg0, tg1, tg2) threadsPerThreadgroup:MTLSizeMake(tptg0, tptg1, tptg2)];
+}
+
+void wsp_ggml_metal_encoder_memory_barrier(wsp_ggml_metal_encoder_t encoder) {
+    [encoder->obj memoryBarrierWithScope:MTLBarrierScopeBuffers];
+}
+
+void wsp_ggml_metal_encoder_end_encoding(wsp_ggml_metal_encoder_t encoder) {
+    [encoder->obj endEncoding];
+}
+
+struct wsp_ggml_metal_device {
+    id<MTLDevice> mtl_device;
+
+    // a single global queue shared by all Metal backends
+    // technically not needed for devices with unified memory, but enables discrete GPUs support
+    // ref: https://github.com/ggml-org/llama.cpp/pull/15906
+    id<MTLCommandQueue> mtl_queue;
+
+    wsp_ggml_metal_library_t library;
+
+    struct wsp_ggml_metal_device_props props;
+};
+
+wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
+    wsp_ggml_metal_device_t dev = calloc(1, sizeof(struct wsp_ggml_metal_device));
+
+    assert(dev != NULL);
+
+    if (dev->mtl_device == nil) {
+        dev->mtl_device = MTLCreateSystemDefaultDevice();
+
+        if (dev->mtl_device) {
+            dev->mtl_queue = [dev->mtl_device newCommandQueue];
+            if (dev->mtl_queue == nil) {
+                WSP_GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+            }
+
+            dev->props.has_simdgroup_reduction  = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
+            dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
+
+            dev->props.has_simdgroup_mm = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
+            dev->props.has_unified_memory = dev->mtl_device.hasUnifiedMemory;
+
+            dev->props.has_bfloat  = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
+            dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
+
+            dev->props.use_residency_sets = true;
+#if defined(WSP_GGML_METAL_HAS_RESIDENCY_SETS)
+            dev->props.use_residency_sets = getenv("WSP_GGML_METAL_NO_RESIDENCY") == nil;
+#endif
+
+            dev->props.use_shared_buffers = dev->props.has_unified_memory;
+
+            if (getenv("WSP_GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
+                dev->props.use_shared_buffers = false;
+            }
+
+            dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
+
+            dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
+            dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
+            dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
+
+            strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1);
+
+            dev->library = wsp_ggml_metal_library_init(dev);
+            if (!dev->library) {
+                WSP_GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
+            }
+
+            // --------------------------------------------------
+
+            // print MTL GPU family:
+            WSP_GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
+
+            // determine max supported GPU family
+            // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
+            // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+            {
+                for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
+                    if ([dev->mtl_device supportsFamily:i]) {
+                        WSP_GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
+                        break;
+                    }
+                }
+
+                for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
+                    if ([dev->mtl_device supportsFamily:i]) {
+                        WSP_GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
+                        break;
+                    }
+                }
+
+                for (int i = MTLGPUFamilyMetal3_GGML + 5; i >= MTLGPUFamilyMetal3_GGML; --i) {
+                    if ([dev->mtl_device supportsFamily:i]) {
+                        WSP_GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3_GGML + 3, i);
+                        break;
+                    }
+                }
+            }
+
+            WSP_GGML_LOG_INFO("%s: simdgroup reduction   = %s\n", __func__, dev->props.has_simdgroup_reduction ? "true" : "false");
+            WSP_GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
+            WSP_GGML_LOG_INFO("%s: has unified memory    = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
+            WSP_GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
+            WSP_GGML_LOG_INFO("%s: use residency sets    = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
+            WSP_GGML_LOG_INFO("%s: use shared buffers    = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");
+
+#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
+            if (@available(macOS 10.12, iOS 16.0, *)) {
+                WSP_GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, dev->props.max_working_set_size / 1e6);
+            }
+#endif
+        }
+    }
+
+    return dev;
+}
+
+void wsp_ggml_metal_device_free(wsp_ggml_metal_device_t dev) {
+    assert(dev != NULL);
+
+    wsp_ggml_metal_library_free(dev->library);
+    dev->library = NULL;
+
+    if (dev->mtl_queue) {
+        [dev->mtl_queue release];
+        dev->mtl_queue = nil;
+    }
+
+    if (dev->mtl_device) {
+        [dev->mtl_device release];
+        dev->mtl_device = nil;
+    }
+
+    free(dev);
+}
+
+void * wsp_ggml_metal_device_get_obj(wsp_ggml_metal_device_t dev) {
+    return dev->mtl_device;
+}
+
+void * wsp_ggml_metal_device_get_queue(wsp_ggml_metal_device_t dev) {
+    return dev->mtl_queue;
+}
+
+wsp_ggml_metal_library_t wsp_ggml_metal_device_get_library(wsp_ggml_metal_device_t dev) {
+    return dev->library;
+}
+
+void wsp_ggml_metal_device_get_memory(wsp_ggml_metal_device_t dev, size_t * free, size_t * total) {
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        *total = dev->mtl_device.recommendedMaxWorkingSetSize;
+        *free  = *total - dev->mtl_device.currentAllocatedSize;
+    } else {
+        *free  = 0;
+        *total = 0;
+    }
+}
+
+bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct wsp_ggml_tensor * op) {
+    const bool has_simdgroup_mm        = dev->props.has_simdgroup_mm;
+    const bool has_simdgroup_reduction = dev->props.has_simdgroup_reduction;
+    const bool has_bfloat              = dev->props.has_bfloat;
+
+    if (!has_bfloat) {
+        if (op->type == WSP_GGML_TYPE_BF16) {
+            return false;
+        }
+
+        for (size_t i = 0, n = 3; i < n; ++i) {
+            if (op->src[i] != NULL && op->src[i]->type == WSP_GGML_TYPE_BF16) {
+                return false;
+            }
+        }
+    }
+
+    switch (op->op) {
+        case WSP_GGML_OP_UNARY:
+            switch (wsp_ggml_get_unary_op(op)) {
+                case WSP_GGML_UNARY_OP_TANH:
+                case WSP_GGML_UNARY_OP_RELU:
+                case WSP_GGML_UNARY_OP_SIGMOID:
+                case WSP_GGML_UNARY_OP_GELU:
+                case WSP_GGML_UNARY_OP_GELU_ERF:
+                case WSP_GGML_UNARY_OP_GELU_QUICK:
+                case WSP_GGML_UNARY_OP_SILU:
+                case WSP_GGML_UNARY_OP_ELU:
+                case WSP_GGML_UNARY_OP_NEG:
+                case WSP_GGML_UNARY_OP_ABS:
+                case WSP_GGML_UNARY_OP_SGN:
+                case WSP_GGML_UNARY_OP_STEP:
+                case WSP_GGML_UNARY_OP_HARDSWISH:
+                case WSP_GGML_UNARY_OP_HARDSIGMOID:
+                case WSP_GGML_UNARY_OP_EXP:
+                    return wsp_ggml_is_contiguous(op->src[0]) && op->src[0]->type == WSP_GGML_TYPE_F32;
+                default:
+                    return false;
+            }
+        case WSP_GGML_OP_GLU:
+            switch (wsp_ggml_get_glu_op(op)) {
+                case WSP_GGML_GLU_OP_REGLU:
+                case WSP_GGML_GLU_OP_GEGLU:
+                case WSP_GGML_GLU_OP_SWIGLU:
+                case WSP_GGML_GLU_OP_SWIGLU_OAI:
+                case WSP_GGML_GLU_OP_GEGLU_ERF:
+                case WSP_GGML_GLU_OP_GEGLU_QUICK:
+                    return wsp_ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == WSP_GGML_TYPE_F32;
+                default:
+                    return false;
+            }
+        case WSP_GGML_OP_NONE:
+        case WSP_GGML_OP_RESHAPE:
+        case WSP_GGML_OP_VIEW:
+        case WSP_GGML_OP_TRANSPOSE:
+        case WSP_GGML_OP_PERMUTE:
+        case WSP_GGML_OP_CONCAT:
+            return true;
+        case WSP_GGML_OP_ADD:
+        case WSP_GGML_OP_SUB:
+        case WSP_GGML_OP_MUL:
+        case WSP_GGML_OP_DIV:
+        case WSP_GGML_OP_ADD_ID:
+            return op->src[0]->type == WSP_GGML_TYPE_F32;
+        case WSP_GGML_OP_ACC:
+        case WSP_GGML_OP_REPEAT:
+        case WSP_GGML_OP_SCALE:
+        case WSP_GGML_OP_CONV_TRANSPOSE_1D:
+            return true;
+        case WSP_GGML_OP_CLAMP:
+            return op->src[0]->type == WSP_GGML_TYPE_F32;
+        case WSP_GGML_OP_SQR:
+        case WSP_GGML_OP_SQRT:
+        case WSP_GGML_OP_SIN:
+        case WSP_GGML_OP_COS:
+        case WSP_GGML_OP_LOG:
+            return wsp_ggml_is_contiguous(op->src[0]) && op->src[0]->type == WSP_GGML_TYPE_F32;
+        case WSP_GGML_OP_SUM_ROWS:
+        case WSP_GGML_OP_MEAN:
+        case WSP_GGML_OP_SOFT_MAX:
+        case WSP_GGML_OP_GROUP_NORM:
+            return has_simdgroup_reduction && wsp_ggml_is_contiguous_rows(op->src[0]);
+        case WSP_GGML_OP_L2_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && wsp_ggml_is_contiguous_1(op->src[0]));
+        case WSP_GGML_OP_ARGMAX:
+            return has_simdgroup_reduction;
+        case WSP_GGML_OP_NORM:
+        case WSP_GGML_OP_RMS_NORM:
+            return has_simdgroup_reduction && (wsp_ggml_is_contiguous_rows(op->src[0]));
+        case WSP_GGML_OP_ROPE:
+            return true;
+        case WSP_GGML_OP_IM2COL:
+            return wsp_ggml_is_contiguous(op->src[1]) && op->src[1]->type == WSP_GGML_TYPE_F32 && (op->type == WSP_GGML_TYPE_F16 || op->type == WSP_GGML_TYPE_F32);
+        case WSP_GGML_OP_POOL_1D:
+            return false;
+        case WSP_GGML_OP_UPSCALE:
+            return op->src[0]->type == WSP_GGML_TYPE_F32 && op->op_params[0] == WSP_GGML_SCALE_MODE_NEAREST;
+        case WSP_GGML_OP_POOL_2D:
+            return op->src[0]->type == WSP_GGML_TYPE_F32;
+        case WSP_GGML_OP_PAD:
+            return (wsp_ggml_get_op_params_i32(op, 0) == 0) && (wsp_ggml_get_op_params_i32(op, 2) == 0) &&
+                   (wsp_ggml_get_op_params_i32(op, 4) == 0) && (wsp_ggml_get_op_params_i32(op, 6) == 0);
+        case WSP_GGML_OP_PAD_REFLECT_1D:
+        case WSP_GGML_OP_TIMESTEP_EMBEDDING:
+        case WSP_GGML_OP_LEAKY_RELU:
+            return op->src[0]->type == WSP_GGML_TYPE_F32;
+        case WSP_GGML_OP_ARGSORT:
+            // TODO: Support arbitrary column width
+            return op->src[0]->ne[0] <= 1024;
+        case WSP_GGML_OP_ARANGE:
+            return true;
+        case WSP_GGML_OP_FLASH_ATTN_EXT:
+            // for new head sizes, add checks here
+            if (op->src[0]->ne[0] != 40 &&
+                op->src[0]->ne[0] != 64 &&
+                op->src[0]->ne[0] != 80 &&
+                op->src[0]->ne[0] != 96 &&
+                op->src[0]->ne[0] != 112 &&
+                op->src[0]->ne[0] != 128 &&
+                op->src[0]->ne[0] != 192 &&
+                op->src[0]->ne[0] != 256) {
+                return false;
+            }
+            if (op->src[0]->ne[0] == 576) {
+                // DeepSeek sizes
+                // TODO: disabled for now, until optmized
+                return false;
+            }
+            if (op->src[1]->type != op->src[2]->type) {
+                return false;
+            }
+            return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
+        case WSP_GGML_OP_SSM_CONV:
+        case WSP_GGML_OP_SSM_SCAN:
+            return has_simdgroup_reduction;
+        case WSP_GGML_OP_RWKV_WKV6:
+        case WSP_GGML_OP_RWKV_WKV7:
+            return true;
+        case WSP_GGML_OP_MUL_MAT:
+        case WSP_GGML_OP_MUL_MAT_ID:
+            return has_simdgroup_reduction;
+        case WSP_GGML_OP_CPY:
+        case WSP_GGML_OP_DUP:
+        case WSP_GGML_OP_CONT:
+            {
+                switch (op->src[0]->type) {
+                    case WSP_GGML_TYPE_F32:
+                        switch (op->type) {
+                            case WSP_GGML_TYPE_F32:
+                            case WSP_GGML_TYPE_F16:
+                            case WSP_GGML_TYPE_BF16:
+                            case WSP_GGML_TYPE_Q8_0:
+                            case WSP_GGML_TYPE_Q4_0:
+                            case WSP_GGML_TYPE_Q4_1:
+                            case WSP_GGML_TYPE_Q5_0:
+                            case WSP_GGML_TYPE_Q5_1:
+                            case WSP_GGML_TYPE_IQ4_NL:
+                            case WSP_GGML_TYPE_I32:
+                                return true;
+                            default:
+                                return false;
+                        }
+                    case WSP_GGML_TYPE_F16:
+                        switch (op->type) {
+                            case WSP_GGML_TYPE_F32:
+                            case WSP_GGML_TYPE_F16:
+                                return true;
+                            default:
+                                return false;
+                        }
+                    case WSP_GGML_TYPE_BF16:
+                        switch (op->type) {
+                            case WSP_GGML_TYPE_F32:
+                            case WSP_GGML_TYPE_BF16:
+                                return true;
+                            default:
+                                return false;
+                        }
+                    case WSP_GGML_TYPE_Q4_0:
+                    case WSP_GGML_TYPE_Q4_1:
+                    case WSP_GGML_TYPE_Q5_0:
+                    case WSP_GGML_TYPE_Q5_1:
+                    case WSP_GGML_TYPE_Q8_0:
+                        switch (op->type) {
+                            case WSP_GGML_TYPE_F32:
+                            case WSP_GGML_TYPE_F16:
+                                return true;
+                            default:
+                                return false;
+                        }
+                    case WSP_GGML_TYPE_I32:
+                        return op->type == WSP_GGML_TYPE_F32;
+                    default:
+                        return false;
+                };
+            }
+        case WSP_GGML_OP_GET_ROWS:
+            {
+                return op->ne[3] == 1;
+            }
+        case WSP_GGML_OP_SET_ROWS:
+            {
+                if (op->src[0]->type != WSP_GGML_TYPE_F32) {
+                    return false;
+                }
+
+                switch (op->type) {
+                    case WSP_GGML_TYPE_F32:
+                    case WSP_GGML_TYPE_F16:
+                    case WSP_GGML_TYPE_BF16:
+                    case WSP_GGML_TYPE_Q8_0:
+                    case WSP_GGML_TYPE_Q4_0:
+                    case WSP_GGML_TYPE_Q4_1:
+                    case WSP_GGML_TYPE_Q5_0:
+                    case WSP_GGML_TYPE_Q5_1:
+                    case WSP_GGML_TYPE_IQ4_NL:
+                        return true;
+                    default:
+                        return false;
+                };
+            }
+        default:
+            return false;
+    }
+}
+
+const struct wsp_ggml_metal_device_props * wsp_ggml_metal_device_get_props(wsp_ggml_metal_device_t dev) {
+    return &dev->props;
+}
+
+//
+// device buffers
+//
+
+// max memory buffers that can be mapped to the device
+#define WSP_GGML_METAL_MAX_BUFFERS 64
+
+struct wsp_ggml_metal_buffer_wrapper {
+    void * data;
+    size_t size;
+
+    id<MTLBuffer> metal;
+};
+
+struct wsp_ggml_metal_buffer {
+    void * all_data; // TODO: https://github.com/ggml-org/llama.cpp/pull/15985
+    size_t all_size;
+
+    // if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
+    bool is_shared;
+    bool owned;
+
+    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+    int n_buffers;
+    struct wsp_ggml_metal_buffer_wrapper buffers[WSP_GGML_METAL_MAX_BUFFERS];
+
+    bool use_residency_sets;
+
+    // optional MTLResidencySet
+    // note: cannot use explicity "id<MTLResidencySet>" here because it is not available on certain OSes
+    id rset;
+
+    // pointers to global device objects
+    id<MTLDevice> device;
+    id<MTLCommandQueue> queue;
+};
+
+static void wsp_ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
+#ifndef WSP_GGML_METAL_NDEBUG
+#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        WSP_GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
+                __func__,
+                size_aligned / 1024.0 / 1024.0,
+                device.currentAllocatedSize / 1024.0 / 1024.0,
+                device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+        if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+            WSP_GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+        }
+    } else {
+        WSP_GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
+                __func__,
+                size_aligned / 1024.0 / 1024.0,
+                device.currentAllocatedSize / 1024.0 / 1024.0);
+    }
+#endif
+#endif
+    WSP_GGML_UNUSED(device);
+    WSP_GGML_UNUSED(size_aligned);
+}
+
+// rset init
+static bool wsp_ggml_metal_buffer_rset_init(wsp_ggml_metal_buffer_t buf) {
+    buf->rset = nil;
+
+    if (!buf->use_residency_sets) {
+        return true;
+    }
+
+#if defined(WSP_GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
+        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"wsp_ggml_metal";
+        desc.initialCapacity = buf->n_buffers;
+
+        NSError * error;
+        buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error];
+        if (error) {
+            WSP_GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            [desc release];
+            return false;
+        }
+
+        [desc release];
+
+        for (int i = 0; i < buf->n_buffers; i++) {
+            [buf->rset addAllocation:buf->buffers[i].metal];
+        }
+
+        [buf->rset commit];
+        [buf->rset requestResidency];
+
+        return true;
+    }
+#endif
+
+    return true;
+}
+
+// rset free
+static void wsp_ggml_metal_buffer_rset_free(wsp_ggml_metal_buffer_t buf) {
+#if defined(WSP_GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
+        if (buf->rset) {
+            [buf->rset endResidency];
+            [buf->rset removeAllAllocations];
+            [buf->rset release];
+        }
+    }
+#else
+    WSP_GGML_UNUSED(buf);
+#endif
+}
+
+static void * wsp_ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+
+#if TARGET_OS_OSX
+    kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
+    if (err != KERN_SUCCESS) {
+        WSP_GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
+        return NULL;
+    }
+#else
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
+    if (result != 0) {
+        WSP_GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+#endif
+
+    return data;
+}
+
+wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_init(wsp_ggml_metal_device_t dev, size_t size, bool shared) {
+    wsp_ggml_metal_buffer_t res = calloc(1, sizeof(struct wsp_ggml_metal_buffer));
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    const struct wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(dev);
+
+    shared = shared && props_dev->use_shared_buffers;
+
+    // allocate shared buffer if the device supports it and it is required by the buffer type
+    if (shared) {
+        res->all_data = wsp_ggml_metal_host_malloc(size_aligned);
+        res->is_shared = true;
+        res->owned = true;
+    } else {
+        // dummy, non-NULL value - we'll populate this after creating the Metal buffer below
+        res->all_data = (void *) 0x000000400ULL;
+        res->is_shared = false;
+    }
+    res->all_size = size_aligned;
+
+    res->device = wsp_ggml_metal_device_get_obj(dev);
+    res->queue = wsp_ggml_metal_device_get_queue(dev);
+
+    res->n_buffers = 1;
+
+    if (res->all_data != NULL) {
+        res->buffers[0].size  = size;
+        res->buffers[0].metal = nil;
+
+        if (size_aligned > 0) {
+            if (props_dev->use_shared_buffers && shared) {
+                res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
+                                                                       length:size_aligned
+                                                                      options:MTLResourceStorageModeShared
+                                                                  deallocator:nil];
+            } else {
+                res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
+
+                res->all_data = (void *) (res->buffers[0].metal.gpuAddress);
+            }
+        }
+
+        res->buffers[0].data = res->all_data;
+    }
+
+    if (size_aligned > 0 && (res->all_data == NULL || res->buffers[0].metal == nil)) {
+        WSP_GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        free(res);
+        return NULL;
+    }
+
+    res->use_residency_sets = props_dev->use_residency_sets;
+
+    if (!wsp_ggml_metal_buffer_rset_init(res)) {
+        WSP_GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(res);
+        return NULL;
+    }
+
+    //wsp_ggml_metal_log_allocated_size(device, size_aligned);
+
+    return res;
+}
+
+wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_map(wsp_ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    wsp_ggml_metal_buffer_t res = calloc(1, sizeof(struct wsp_ggml_metal_buffer));
+
+    res->all_data = ptr;
+    res->all_size = size;
+
+    res->is_shared = true;
+    res->owned = false;
+
+    res->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+
+    // page-align the data ptr
+    {
+        const uintptr_t offs = (uintptr_t) ptr % size_page;
+        ptr  = (void *) ((char *) ptr - offs);
+        size += offs;
+    }
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    res->device = wsp_ggml_metal_device_get_obj(dev);
+    res->queue = wsp_ggml_metal_device_get_queue(dev);
+
+    const struct wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(dev);
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= props_dev->max_buffer_size) {
+        res->buffers[res->n_buffers].data  = ptr;
+        res->buffers[res->n_buffers].size  = size;
+        res->buffers[res->n_buffers].metal = nil;
+
+        if (size_aligned > 0) {
+            res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (res->buffers[res->n_buffers].metal == nil) {
+                WSP_GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+                free(res);
+                return NULL;
+            }
+        }
+
+        wsp_ggml_metal_log_allocated_size(res->device, size_aligned);
+
+        ++res->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = props_dev->max_buffer_size - size_ovlp;
+        const size_t size_view = props_dev->max_buffer_size;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            res->buffers[res->n_buffers].data  = (void *) ((uint8_t *) ptr + i);
+            res->buffers[res->n_buffers].size  = size_step_aligned;
+            res->buffers[res->n_buffers].metal = nil;
+
+            if (size_step_aligned > 0) {
+                res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+                if (res->buffers[res->n_buffers].metal == nil) {
+                    WSP_GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                    free(res);
+                    return NULL;
+                }
+            }
+
+            wsp_ggml_metal_log_allocated_size(res->device, size_step_aligned);
+
+            if (i + size_step < size) {
+                WSP_GGML_LOG_INFO("\n");
+            }
+
+            ++res->n_buffers;
+        }
+    }
+
+    res->use_residency_sets = props_dev->use_residency_sets;
+
+    if (!wsp_ggml_metal_buffer_rset_init(res)) {
+        WSP_GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(res);
+        return NULL;
+    }
+
+    return res;
+}
+
+void wsp_ggml_metal_buffer_free(wsp_ggml_metal_buffer_t buf) {
+    for (int i = 0; i < buf->n_buffers; i++) {
+        [buf->buffers[i].metal release];
+    }
+
+    wsp_ggml_metal_buffer_rset_free(buf);
+
+    if (buf->is_shared && buf->owned) {
+#if TARGET_OS_OSX
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)buf->all_data, buf->all_size);
+#else
+        free(buf->all_data);
+#endif
+    }
+
+    free(buf);
+}
+
+void * wsp_ggml_metal_buffer_get_base(wsp_ggml_metal_buffer_t buf) {
+    return buf->all_data;
+}
+
+bool wsp_ggml_metal_buffer_is_shared(wsp_ggml_metal_buffer_t buf) {
+    return buf->is_shared;
+}
+
+void wsp_ggml_metal_buffer_memset_tensor(wsp_ggml_metal_buffer_t buf, struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    if (buf->is_shared) {
+        memset((char *)tensor->data + offset, value, size);
+        return;
+    }
+
+    @autoreleasepool {
+        // dst
+        struct wsp_ggml_metal_buffer_id bid_dst = wsp_ggml_metal_buffer_get_id(buf, tensor);
+        bid_dst.offs += offset;
+
+        id<MTLCommandQueue>  queue   = buf->queue;
+        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+
+        {
+            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+            [encoder fillBuffer:bid_dst.metal
+                          range:NSMakeRange(bid_dst.offs, bid_dst.offs + size)
+                          value:value];
+
+            [encoder endEncoding];
+        }
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+}
+
+void wsp_ggml_metal_buffer_set_tensor(wsp_ggml_metal_buffer_t buf, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    if (buf->is_shared) {
+        memcpy((char *)tensor->data + offset, data, size);
+        return;
+    }
+
+    @autoreleasepool {
+        // src
+        void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
+        id<MTLBuffer> buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr
+                                                               length:size
+                                                              options:MTLResourceStorageModeShared
+                                                          deallocator:nil];
+
+        WSP_GGML_ASSERT(buf_src);
+
+        // dst
+        struct wsp_ggml_metal_buffer_id bid_dst = wsp_ggml_metal_buffer_get_id(buf, tensor);
+        bid_dst.offs += offset;
+
+        // note: for experimentation purposes, here we use a semaphore to wait for the copy to complete
+        //       this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
+        dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);
+
+        id<MTLCommandQueue>  queue   = buf->queue;
+        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+
+        {
+            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+            [encoder copyFromBuffer:buf_src
+                       sourceOffset:0
+                           toBuffer:bid_dst.metal
+                  destinationOffset:bid_dst.offs
+                               size:size];
+
+            [encoder endEncoding];
+        }
+
+        [cmd_buf addCompletedHandler:^(id<MTLCommandBuffer> cb) {
+            // TODO: can check for errors here
+            WSP_GGML_UNUSED(cb);
+
+            dispatch_semaphore_signal(completion_semaphore);
+        }];
+
+        [cmd_buf commit];
+
+        dispatch_semaphore_wait(completion_semaphore, DISPATCH_TIME_FOREVER);
+        dispatch_release(completion_semaphore);
+
+        //[cmd_buf waitUntilCompleted];
+    }
+}
+
+void wsp_ggml_metal_buffer_get_tensor(wsp_ggml_metal_buffer_t buf, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    if (buf->is_shared) {
+        memcpy(data, (const char *)tensor->data + offset, size);
+        return;
+    }
+
+    @autoreleasepool {
+        // src
+        struct wsp_ggml_metal_buffer_id bid_src = wsp_ggml_metal_buffer_get_id(buf, tensor);
+        bid_src.offs += offset;
+
+        // dst
+        id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
+                                                               length:size
+                                                              options:MTLResourceStorageModeShared
+                                                          deallocator:nil];
+
+        WSP_GGML_ASSERT(buf_dst);
+
+        id<MTLCommandQueue>  queue   = buf->queue;
+        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+
+        {
+            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+            [encoder copyFromBuffer:bid_src.metal
+                       sourceOffset:bid_src.offs
+                           toBuffer:buf_dst
+                  destinationOffset:0
+                               size:size];
+
+            [encoder endEncoding];
+        }
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+}
+
+void wsp_ggml_metal_buffer_clear(wsp_ggml_metal_buffer_t buf, uint8_t value) {
+    if (buf->is_shared) {
+        memset(buf->all_data, value, buf->all_size);
+        return;
+    }
+
+    @autoreleasepool {
+        id<MTLCommandQueue>  queue   = buf->queue;
+        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+
+        {
+            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+            [encoder fillBuffer:buf->buffers[0].metal
+                          range:NSMakeRange(0, buf->buffers[0].size)
+                          value:value];
+
+            [encoder endEncoding];
+        }
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+}
+
+struct wsp_ggml_metal_buffer_id wsp_ggml_metal_buffer_get_id(wsp_ggml_metal_buffer_t buf, const struct wsp_ggml_tensor * t) {
+    struct wsp_ggml_metal_buffer_id res = { nil, 0 };
+
+    const int64_t tsize = wsp_ggml_nbytes(t);
+
+    // find the view that contains the tensor fully
+    for (int i = 0; i < buf->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) buf->buffers[i].data;
+
+        //WSP_GGML_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf->buffers[i].size);
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf->buffers[i].size) {
+            res.metal = buf->buffers[i].metal;
+            res.offs  = (size_t) ioffs;
+
+            //WSP_GGML_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+
+            return res;
+        }
+    }
+
+    WSP_GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
+
+    return res;
+}