whisper.rn 0.4.0-rc.4 → 0.4.0-rc.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +6 -6
  2. package/android/build.gradle +4 -0
  3. package/android/src/main/CMakeLists.txt +5 -0
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
  5. package/android/src/main/java/com/rnwhisper/WhisperContext.java +51 -133
  6. package/android/src/main/jni-utils.h +76 -0
  7. package/android/src/main/jni.cpp +187 -112
  8. package/cpp/README.md +1 -1
  9. package/cpp/coreml/whisper-encoder-impl.h +1 -1
  10. package/cpp/coreml/whisper-encoder.h +4 -0
  11. package/cpp/coreml/whisper-encoder.mm +4 -2
  12. package/cpp/ggml-alloc.c +55 -19
  13. package/cpp/ggml-alloc.h +7 -0
  14. package/cpp/ggml-backend-impl.h +46 -21
  15. package/cpp/ggml-backend.c +563 -156
  16. package/cpp/ggml-backend.h +62 -17
  17. package/cpp/ggml-impl.h +1 -1
  18. package/cpp/ggml-metal-whisper.metal +1010 -253
  19. package/cpp/ggml-metal.h +7 -1
  20. package/cpp/ggml-metal.m +618 -187
  21. package/cpp/ggml-quants.c +64 -59
  22. package/cpp/ggml-quants.h +40 -40
  23. package/cpp/ggml.c +751 -1466
  24. package/cpp/ggml.h +90 -25
  25. package/cpp/rn-audioutils.cpp +68 -0
  26. package/cpp/rn-audioutils.h +14 -0
  27. package/cpp/rn-whisper-log.h +11 -0
  28. package/cpp/rn-whisper.cpp +141 -59
  29. package/cpp/rn-whisper.h +47 -15
  30. package/cpp/whisper.cpp +1635 -928
  31. package/cpp/whisper.h +55 -10
  32. package/ios/RNWhisper.mm +7 -7
  33. package/ios/RNWhisperAudioUtils.h +0 -2
  34. package/ios/RNWhisperAudioUtils.m +0 -56
  35. package/ios/RNWhisperContext.h +3 -11
  36. package/ios/RNWhisperContext.mm +62 -134
  37. package/lib/commonjs/version.json +1 -1
  38. package/lib/module/version.json +1 -1
  39. package/package.json +6 -5
  40. package/src/version.json +1 -1
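The hunks below are from package/cpp/ggml-backend.c, the largest native change in this version: backend buffers are now created through buffer types (wsp_ggml_backend_buffer_type_t) rather than directly from a backend, and a small backend registry is introduced. The following sketch is illustrative only; it is not part of the package, the wsp_ggml_new_tensor_1d helper and WSP_GGML_TYPE_F32 constant are assumed to exist with the usual wsp_ prefix, and the header names follow the files under package/cpp/. It shows roughly how the new buffer-type path could be driven:

```c
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Illustrative sketch, not shipped in the package.
static void backend_buffer_demo(void) {
    // resolve a backend through the new registry; "CPU" is registered unconditionally
    wsp_ggml_backend_t backend = wsp_ggml_backend_reg_init_backend_from_str("CPU");
    if (backend == NULL) {
        backend = wsp_ggml_backend_cpu_init();
    }

    // build a no_alloc context: tensor metadata only, no data
    struct wsp_ggml_init_params params = {
        /* .mem_size   = */ wsp_ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,
    };
    struct wsp_ggml_context * ctx = wsp_ggml_init(params);
    // assumed wsp_-prefixed helper and type constant, mirroring upstream ggml
    struct wsp_ggml_tensor  * t   = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 16);

    // place every tensor of the context in one buffer from the backend's
    // default buffer type (helper used elsewhere in this diff)
    wsp_ggml_backend_buffer_t buf = wsp_ggml_backend_alloc_ctx_tensors(ctx, backend);

    // tensor I/O now goes through buffer->iface.set_tensor / get_tensor
    float data[16] = { 0 };
    wsp_ggml_backend_tensor_set(t, data, 0, sizeof(data));
    wsp_ggml_backend_tensor_get(t, data, 0, sizeof(data));

    wsp_ggml_backend_buffer_free(buf);
    wsp_ggml_free(ctx);
    wsp_ggml_backend_free(backend);
}
```

The extra indirection lets a buffer be described and allocated without a live backend instance, which is what the registry's default_buffer_type field relies on.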
@@ -9,14 +9,36 @@
  #include <stdlib.h>
  #include <string.h>

- #define UNUSED WSP_GGML_UNUSED

  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+
+ // backend buffer type
+
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
+ return buft->iface.alloc_buffer(buft, size);
+ }
+
+ size_t wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_alignment(buft);
+ }
+
+ size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft, struct wsp_ggml_tensor * tensor) {
+ // get_alloc_size is optional, defaults to wsp_ggml_nbytes
+ if (buft->iface.get_alloc_size) {
+ return buft->iface.get_alloc_size(buft, tensor);
+ }
+ return wsp_ggml_nbytes(tensor);
+ }
+
+ bool wsp_ggml_backend_buft_supports_backend(wsp_ggml_backend_buffer_type_t buft, wsp_ggml_backend_t backend) {
+ return buft->iface.supports_backend(buft, backend);
+ }
+
  // backend buffer

  wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
- struct wsp_ggml_backend * backend,
+ wsp_ggml_backend_buffer_type_t buft,
  struct wsp_ggml_backend_buffer_i iface,
  wsp_ggml_backend_buffer_context_t context,
  size_t size) {
@@ -26,7 +48,7 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(

  (*buffer) = (struct wsp_ggml_backend_buffer) {
  /* .interface = */ iface,
- /* .backend = */ backend,
+ /* .buft = */ buft,
  /* .context = */ context,
  /* .size = */ size,
  };
@@ -45,10 +67,6 @@ void wsp_ggml_backend_buffer_free(wsp_ggml_backend_buffer_t buffer) {
  free(buffer);
  }

- size_t wsp_ggml_backend_buffer_get_alignment(wsp_ggml_backend_buffer_t buffer) {
- return wsp_ggml_backend_get_alignment(buffer->backend);
- }
-
  size_t wsp_ggml_backend_buffer_get_size(wsp_ggml_backend_buffer_t buffer) {
  return buffer->size;
  }
@@ -61,14 +79,6 @@ void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
  return base;
  }

- size_t wsp_ggml_backend_buffer_get_alloc_size(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
- // get_alloc_size is optional, defaults to wsp_ggml_nbytes
- if (buffer->iface.get_alloc_size) {
- return buffer->iface.get_alloc_size(buffer, tensor);
- }
- return wsp_ggml_nbytes(tensor);
- }
-
  void wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
  // init_tensor is optional
  if (buffer->iface.init_tensor) {
@@ -76,19 +86,20 @@ void wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struc
  }
  }

- void wsp_ggml_backend_buffer_free_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
- // free_tensor is optional
- if (buffer->iface.free_tensor) {
- buffer->iface.free_tensor(buffer, tensor);
- }
+ size_t wsp_ggml_backend_buffer_get_alignment (wsp_ggml_backend_buffer_t buffer) {
+ return wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_buffer_type(buffer));
  }

- // backend
+ size_t wsp_ggml_backend_buffer_get_alloc_size(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
+ return wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type(buffer), tensor);
+ }

- wsp_ggml_backend_t wsp_ggml_get_backend(const struct wsp_ggml_tensor * tensor) {
- return tensor->buffer ? tensor->buffer->backend : NULL;
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_buffer_type(wsp_ggml_backend_buffer_t buffer) {
+ return buffer->buft;
  }

+ // backend
+
  const char * wsp_ggml_backend_name(wsp_ggml_backend_t backend) {
  if (backend == NULL) {
  return "NULL";
@@ -104,43 +115,53 @@ void wsp_ggml_backend_free(wsp_ggml_backend_t backend) {
  backend->iface.free(backend);
  }

+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_get_default_buffer_type(wsp_ggml_backend_t backend) {
+ return backend->iface.get_default_buffer_type(backend);
+ }
+
  wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_buffer(wsp_ggml_backend_t backend, size_t size) {
- return backend->iface.alloc_buffer(backend, size);
+ return wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_get_default_buffer_type(backend), size);
  }

  size_t wsp_ggml_backend_get_alignment(wsp_ggml_backend_t backend) {
- return backend->iface.get_alignment(backend);
+ return wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_get_default_buffer_type(backend));
  }

- void wsp_ggml_backend_tensor_set_async(struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- wsp_ggml_get_backend(tensor)->iface.set_tensor_async(wsp_ggml_get_backend(tensor), tensor, data, offset, size);
+ void wsp_ggml_backend_tensor_set_async(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
+
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
  }

- void wsp_ggml_backend_tensor_get_async(const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- wsp_ggml_get_backend(tensor)->iface.get_tensor_async(wsp_ggml_get_backend(tensor), tensor, data, offset, size);
+ void wsp_ggml_backend_tensor_get_async(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");
+
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
  }

  void wsp_ggml_backend_tensor_set(struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- wsp_ggml_backend_t backend = wsp_ggml_get_backend(tensor);
-
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- WSP_GGML_ASSERT(backend != NULL && "tensor backend not set");
+ WSP_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");

- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
- backend->iface.synchronize(backend);
+ tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
  }

  void wsp_ggml_backend_tensor_get(const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- wsp_ggml_backend_t backend = wsp_ggml_get_backend(tensor);
-
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- WSP_GGML_ASSERT(backend != NULL && "tensor backend not set");
+ WSP_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");

- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
- backend->iface.synchronize(backend);
+ tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
  }

  void wsp_ggml_backend_synchronize(wsp_ggml_backend_t backend) {
+ if (backend->iface.synchronize == NULL) {
+ return;
+ }
+
  backend->iface.synchronize(backend);
  }

@@ -154,10 +175,16 @@ void wsp_ggml_backend_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backe

  void wsp_ggml_backend_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
  backend->iface.graph_plan_compute(backend, plan);
+
+ // TODO: optional sync
+ wsp_ggml_backend_synchronize(backend);
  }

  void wsp_ggml_backend_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
  backend->iface.graph_compute(backend, cgraph);
+
+ // TODO: optional sync
+ wsp_ggml_backend_synchronize(backend);
  }

  bool wsp_ggml_backend_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
@@ -194,14 +221,15 @@ void wsp_ggml_backend_tensor_copy(struct wsp_ggml_tensor * src, struct wsp_ggml_

  // TODO: allow backends to support copy to/from same backend

- if (wsp_ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
- wsp_ggml_get_backend(dst)->iface.cpy_tensor_from(wsp_ggml_get_backend(dst)->context, src, dst);
- } else if (wsp_ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
- wsp_ggml_get_backend(src)->iface.cpy_tensor_to(wsp_ggml_get_backend(src)->context, src, dst);
+ if (dst->buffer->iface.cpy_tensor_from != NULL) {
+ dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
+ } else if (src->buffer->iface.cpy_tensor_to != NULL) {
+ src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
  } else {
  // shouldn't be hit when copying from/to CPU
  #ifndef NDEBUG
- fprintf(stderr, "wsp_ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", wsp_ggml_backend_name(src->buffer->backend), wsp_ggml_backend_name(dst->buffer->backend));
+ fprintf(stderr, "wsp_ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
+ "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
  #endif
  size_t nbytes = wsp_ggml_nbytes(src);
  void * data = malloc(nbytes);
@@ -211,101 +239,259 @@ void wsp_ggml_backend_tensor_copy(struct wsp_ggml_tensor * src, struct wsp_ggml_
  }
  }

- // backend CPU
+ // backend registry

- struct wsp_ggml_backend_cpu_context {
- int n_threads;
- void * work_data;
- size_t work_size;
+ #define WSP_GGML_MAX_BACKENDS_REG 16
+
+ struct wsp_ggml_backend_reg {
+ char name[128];
+ wsp_ggml_backend_init_fn init_fn;
+ wsp_ggml_backend_buffer_type_t default_buffer_type;
+ void * user_data;
  };

- static const char * wsp_ggml_backend_cpu_name(wsp_ggml_backend_t backend) {
- return "CPU";
+ static struct wsp_ggml_backend_reg wsp_ggml_backend_registry[WSP_GGML_MAX_BACKENDS_REG];
+ static size_t wsp_ggml_backend_registry_count = 0;
+
+ static wsp_ggml_backend_t wsp_ggml_backend_reg_cpu_init(const char * params, void * user_data);
+
+ static void wsp_ggml_backend_registry_init(void) {
+ static bool initialized = false;
+
+ if (initialized) {
+ return;
+ }
+
+ initialized = true;

- UNUSED(backend);
+ wsp_ggml_backend_register("CPU", wsp_ggml_backend_reg_cpu_init, wsp_ggml_backend_cpu_buffer_type(), NULL);
+
+ // add forward decls here to avoid including the backend headers
+ #ifdef WSP_GGML_USE_CUBLAS
+ extern void wsp_ggml_backend_cuda_reg_devices(void);
+ wsp_ggml_backend_cuda_reg_devices();
+ #endif
+
+ #ifdef WSP_GGML_USE_METAL
+ extern wsp_ggml_backend_t wsp_ggml_backend_reg_metal_init(const char * params, void * user_data);
+ extern wsp_ggml_backend_buffer_type_t wsp_ggml_backend_metal_buffer_type(void);
+ wsp_ggml_backend_register("Metal", wsp_ggml_backend_reg_metal_init, wsp_ggml_backend_metal_buffer_type(), NULL);
+ #endif
  }

- static void wsp_ggml_backend_cpu_free(wsp_ggml_backend_t backend) {
- struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
- free(cpu_ctx->work_data);
- free(cpu_ctx);
- free(backend);
+ void wsp_ggml_backend_register(const char * name, wsp_ggml_backend_init_fn init_fn, wsp_ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
+ WSP_GGML_ASSERT(wsp_ggml_backend_registry_count < WSP_GGML_MAX_BACKENDS_REG);
+
+ int id = wsp_ggml_backend_registry_count;
+
+ wsp_ggml_backend_registry[id] = (struct wsp_ggml_backend_reg) {
+ /* .name = */ {0},
+ /* .fn = */ init_fn,
+ /* .default_buffer_type = */ default_buffer_type,
+ /* .user_data = */ user_data,
+ };
+
+ snprintf(wsp_ggml_backend_registry[id].name, sizeof(wsp_ggml_backend_registry[id].name), "%s", name);
+
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: registered backend %s\n", __func__, name);
+ #endif
+
+ wsp_ggml_backend_registry_count++;
+ }
+
+ size_t wsp_ggml_backend_reg_get_count(void) {
+ wsp_ggml_backend_registry_init();
+
+ return wsp_ggml_backend_registry_count;
+ }
+
+ size_t wsp_ggml_backend_reg_find_by_name(const char * name) {
+ wsp_ggml_backend_registry_init();
+
+ for (size_t i = 0; i < wsp_ggml_backend_registry_count; i++) {
+ // TODO: case insensitive in a portable way
+ if (strcmp(wsp_ggml_backend_registry[i].name, name) == 0) {
+ return i;
+ }
+ }
+ return SIZE_MAX;
+ }
+
+ // init from backend:params string
+ wsp_ggml_backend_t wsp_ggml_backend_reg_init_backend_from_str(const char * backend_str) {
+ wsp_ggml_backend_registry_init();
+
+ const char * params = strchr(backend_str, ':');
+ char backend_name[128];
+ if (params == NULL) {
+ strcpy(backend_name, backend_str);
+ params = "";
+ } else {
+ strncpy(backend_name, backend_str, params - backend_str);
+ backend_name[params - backend_str] = '\0';
+ params++;
+ }
+
+ size_t backend_i = wsp_ggml_backend_reg_find_by_name(backend_name);
+ if (backend_i == SIZE_MAX) {
+ fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
+ return NULL;
+ }
+
+ return wsp_ggml_backend_reg_init_backend(backend_i, params);
+ }
+
+ const char * wsp_ggml_backend_reg_get_name(size_t i) {
+ wsp_ggml_backend_registry_init();
+
+ WSP_GGML_ASSERT(i < wsp_ggml_backend_registry_count);
+ return wsp_ggml_backend_registry[i].name;
+ }
+
+ wsp_ggml_backend_t wsp_ggml_backend_reg_init_backend(size_t i, const char * params) {
+ wsp_ggml_backend_registry_init();
+
+ WSP_GGML_ASSERT(i < wsp_ggml_backend_registry_count);
+ return wsp_ggml_backend_registry[i].init_fn(params, wsp_ggml_backend_registry[i].user_data);
+ }
+
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_reg_get_default_buffer_type(size_t i) {
+ wsp_ggml_backend_registry_init();
+
+ WSP_GGML_ASSERT(i < wsp_ggml_backend_registry_count);
+ return wsp_ggml_backend_registry[i].default_buffer_type;
+ }
+
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
+ wsp_ggml_backend_registry_init();
+
+ WSP_GGML_ASSERT(i < wsp_ggml_backend_registry_count);
+ return wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_registry[i].default_buffer_type, size);
  }

+ // backend CPU
+
  static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
  return (void *)buffer->context;
  }

  static void wsp_ggml_backend_cpu_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
  free(buffer->context);
- UNUSED(buffer);
+ WSP_GGML_UNUSED(buffer);
+ }
+
+ static void wsp_ggml_backend_cpu_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+ memcpy((char *)tensor->data + offset, data, size);
+
+ WSP_GGML_UNUSED(buffer);
+ }
+
+ static void wsp_ggml_backend_cpu_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+ memcpy(data, (const char *)tensor->data + offset, size);
+
+ WSP_GGML_UNUSED(buffer);
+ }
+
+ static void wsp_ggml_backend_cpu_buffer_cpy_tensor_from(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
+ wsp_ggml_backend_tensor_get(src, dst->data, 0, wsp_ggml_nbytes(src));
+
+ WSP_GGML_UNUSED(buffer);
+ }
+
+ static void wsp_ggml_backend_cpu_buffer_cpy_tensor_to(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
+ wsp_ggml_backend_tensor_set(dst, src->data, 0, wsp_ggml_nbytes(src));
+
+ WSP_GGML_UNUSED(buffer);
  }

  static struct wsp_ggml_backend_buffer_i cpu_backend_buffer_i = {
- /* .free_buffer = */ wsp_ggml_backend_cpu_buffer_free_buffer,
- /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
- /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
- /* .init_tensor = */ NULL, // no initialization required
- /* .free_tensor = */ NULL, // no cleanup required
+ /* .free_buffer = */ wsp_ggml_backend_cpu_buffer_free_buffer,
+ /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor_from = */ wsp_ggml_backend_cpu_buffer_cpy_tensor_from,
+ /* .cpy_tensor_to = */ wsp_ggml_backend_cpu_buffer_cpy_tensor_to,
  };

  // for buffers from ptr, free is not called
  static struct wsp_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
- /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
- /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
- /* .init_tensor = */ NULL,
- /* .free_tensor = */ NULL,
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+ /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor_from = */ wsp_ggml_backend_cpu_buffer_cpy_tensor_from,
+ /* .cpy_tensor_to = */ wsp_ggml_backend_cpu_buffer_cpy_tensor_to,
  };

  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

- static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_alloc_buffer(wsp_ggml_backend_t backend, size_t size) {
+ static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
  void * data = malloc(size); // TODO: maybe use WSP_GGML_ALIGNED_MALLOC?

  WSP_GGML_ASSERT(data != NULL && "failed to allocate buffer");

- return wsp_ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
+ return wsp_ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
  }

- static size_t wsp_ggml_backend_cpu_get_alignment(wsp_ggml_backend_t backend) {
+ static size_t wsp_ggml_backend_cpu_buffer_type_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
  return TENSOR_ALIGNMENT;
- UNUSED(backend);
- }

- static void wsp_ggml_backend_cpu_set_tensor_async(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
- WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ WSP_GGML_UNUSED(buft);
+ }

- memcpy((char *)tensor->data + offset, data, size);
+ static bool wsp_ggml_backend_cpu_buffer_type_supports_backend(wsp_ggml_backend_buffer_type_t buft, wsp_ggml_backend_t backend) {
+ return wsp_ggml_backend_is_cpu(backend);

- UNUSED(backend);
+ WSP_GGML_UNUSED(buft);
  }

- static void wsp_ggml_backend_cpu_get_tensor_async(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");
- WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
- memcpy(data, (const char *)tensor->data + offset, size);
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_buffer_type(void) {
+ static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_buffer_type_cpu = {
+ /* .iface = */ {
+ /* .alloc_buffer = */ wsp_ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
+ /* .supports_backend = */ wsp_ggml_backend_cpu_buffer_type_supports_backend,
+ },
+ /* .context = */ NULL,
+ };

- UNUSED(backend);
+ return &wsp_ggml_backend_buffer_type_cpu;
  }

- static void wsp_ggml_backend_cpu_synchronize(wsp_ggml_backend_t backend) {
- UNUSED(backend);
- }
+ struct wsp_ggml_backend_cpu_context {
+ int n_threads;
+ void * work_data;
+ size_t work_size;
+ };

- static void wsp_ggml_backend_cpu_cpy_tensor_from(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
- wsp_ggml_backend_tensor_get(src, dst->data, 0, wsp_ggml_nbytes(src));
+ static const char * wsp_ggml_backend_cpu_name(wsp_ggml_backend_t backend) {
+ return "CPU";

- UNUSED(backend);
+ WSP_GGML_UNUSED(backend);
  }

- static void wsp_ggml_backend_cpu_cpy_tensor_to(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
- wsp_ggml_backend_tensor_set(dst, src->data, 0, wsp_ggml_nbytes(src));
+ static void wsp_ggml_backend_cpu_free(wsp_ggml_backend_t backend) {
+ struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
+ free(cpu_ctx->work_data);
+ free(cpu_ctx);
+ free(backend);
+ }
+
+ static wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_get_default_buffer_type(wsp_ggml_backend_t backend) {
+ return wsp_ggml_backend_cpu_buffer_type();

- UNUSED(backend);
+ WSP_GGML_UNUSED(backend);
  }

  struct wsp_ggml_backend_plan_cpu {
@@ -334,7 +520,7 @@ static void wsp_ggml_backend_cpu_graph_plan_free(wsp_ggml_backend_t backend, wsp
  free(cpu_plan->cplan.work_data);
  free(cpu_plan);

- UNUSED(backend);
+ WSP_GGML_UNUSED(backend);
  }

  static void wsp_ggml_backend_cpu_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
@@ -342,7 +528,7 @@ static void wsp_ggml_backend_cpu_graph_plan_compute(wsp_ggml_backend_t backend,

  wsp_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

- UNUSED(backend);
+ WSP_GGML_UNUSED(backend);
  }

  static void wsp_ggml_backend_cpu_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
@@ -363,25 +549,25 @@ static void wsp_ggml_backend_cpu_graph_compute(wsp_ggml_backend_t backend, struc

  static bool wsp_ggml_backend_cpu_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
  return true;
- UNUSED(backend);
- UNUSED(op);
+
+ WSP_GGML_UNUSED(backend);
+ WSP_GGML_UNUSED(op);
  }

  static struct wsp_ggml_backend_i cpu_backend_i = {
- /* .get_name = */ wsp_ggml_backend_cpu_name,
- /* .free = */ wsp_ggml_backend_cpu_free,
- /* .alloc_buffer = */ wsp_ggml_backend_cpu_alloc_buffer,
- /* .get_alignment = */ wsp_ggml_backend_cpu_get_alignment,
- /* .set_tensor_async = */ wsp_ggml_backend_cpu_set_tensor_async,
- /* .get_tensor_async = */ wsp_ggml_backend_cpu_get_tensor_async,
- /* .synchronize = */ wsp_ggml_backend_cpu_synchronize,
- /* .cpy_tensor_from = */ wsp_ggml_backend_cpu_cpy_tensor_from,
- /* .cpy_tensor_to = */ wsp_ggml_backend_cpu_cpy_tensor_to,
- /* .graph_plan_create = */ wsp_ggml_backend_cpu_graph_plan_create,
- /* .graph_plan_free = */ wsp_ggml_backend_cpu_graph_plan_free,
- /* .graph_plan_compute = */ wsp_ggml_backend_cpu_graph_plan_compute,
- /* .graph_compute = */ wsp_ggml_backend_cpu_graph_compute,
- /* .supports_op = */ wsp_ggml_backend_cpu_supports_op,
+ /* .get_name = */ wsp_ggml_backend_cpu_name,
+ /* .free = */ wsp_ggml_backend_cpu_free,
+ /* .get_default_buffer_type = */ wsp_ggml_backend_cpu_get_default_buffer_type,
+ /* .set_tensor_async = */ NULL,
+ /* .get_tensor_async = */ NULL,
+ /* .cpy_tensor_from_async = */ NULL,
+ /* .cpy_tensor_to_async = */ NULL,
+ /* .synchronize = */ NULL,
+ /* .graph_plan_create = */ wsp_ggml_backend_cpu_graph_plan_create,
+ /* .graph_plan_free = */ wsp_ggml_backend_cpu_graph_plan_free,
+ /* .graph_plan_compute = */ wsp_ggml_backend_cpu_graph_plan_compute,
+ /* .graph_compute = */ wsp_ggml_backend_cpu_graph_compute,
+ /* .supports_op = */ wsp_ggml_backend_cpu_supports_op,
  };

  wsp_ggml_backend_t wsp_ggml_backend_cpu_init(void) {
@@ -411,10 +597,18 @@ void wsp_ggml_backend_cpu_set_n_threads(wsp_ggml_backend_t backend_cpu, int n_th
  ctx->n_threads = n_threads;
  }

- wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_from_ptr(wsp_ggml_backend_t backend_cpu, void * ptr, size_t size) {
- return wsp_ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+ return wsp_ggml_backend_buffer_init(wsp_ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
+ }
+
+ static wsp_ggml_backend_t wsp_ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+ return wsp_ggml_backend_cpu_init();
+
+ WSP_GGML_UNUSED(params);
+ WSP_GGML_UNUSED(user_data);
  }

+
  // scheduler

  #define WSP_GGML_MAX_BACKENDS 4
@@ -427,7 +621,7 @@ struct wsp_ggml_backend_sched_split {
  int i_end;
  struct wsp_ggml_tensor * inputs[WSP_GGML_MAX_SPLIT_INPUTS];
  int n_inputs;
- struct wsp_ggml_cgraph * graph;
+ struct wsp_ggml_cgraph graph;
  };

  struct wsp_ggml_backend_sched {
@@ -453,7 +647,7 @@ struct wsp_ggml_backend_sched {
  #else
  __attribute__((aligned(WSP_GGML_MEM_ALIGN)))
  #endif
- char context_buffer[WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS*sizeof(struct wsp_ggml_tensor) + WSP_GGML_MAX_SPLITS*sizeof(struct wsp_ggml_cgraph)];
+ char context_buffer[WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS*sizeof(struct wsp_ggml_tensor) + sizeof(struct wsp_ggml_cgraph)];
  };

  #define hash_id(node) wsp_ggml_hash_find_or_insert(sched->hash_set, node)
@@ -482,23 +676,57 @@ static int sched_allocr_prio(wsp_ggml_backend_sched_t sched, wsp_ggml_tallocr_t
  return INT_MAX;
  }

+ static wsp_ggml_backend_t get_buffer_backend(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_buffer_t buffer) {
+ if (buffer == NULL) {
+ return NULL;
+ }
+ // find highest prio backend that supports the buffer type
+ for (int i = 0; i < sched->n_backends; i++) {
+ if (wsp_ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+ return sched->backends[i];
+ }
+ }
+ WSP_GGML_ASSERT(false && "tensor buffer type not supported by any backend");
+ }
+
+ static wsp_ggml_backend_t get_allocr_backend(wsp_ggml_backend_sched_t sched, wsp_ggml_tallocr_t allocr) {
+ if (allocr == NULL) {
+ return NULL;
+ }
+ // find highest prio backend that supports the buffer type
+ for (int i = 0; i < sched->n_backends; i++) {
+ if (sched->tallocs[i] == allocr) {
+ return sched->backends[i];
+ }
+ }
+ WSP_GGML_UNREACHABLE();
+ }
+
+ #if 0
+ static char causes[WSP_GGML_DEFAULT_GRAPH_SIZE*8 + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
+ #define GET_CAUSE(node) causes[hash_id(node)]
+ #else
+ #define SET_CAUSE(node, ...)
+ #define GET_CAUSE(node) ""
+ #endif
+
  // returns the backend that should be used for the node based on the current locations
- char causes[WSP_GGML_DEFAULT_GRAPH_SIZE*4 + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
  static wsp_ggml_backend_t sched_backend_from_cur(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node) {
  // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
  // ie. kv cache updates
  // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
  // dst
- wsp_ggml_backend_t cur_backend = wsp_ggml_get_backend(node);
+ wsp_ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
  if (cur_backend != NULL) {
- sprintf(causes[hash_id(node)], "1.dst");
+ SET_CAUSE(node, "1.dst");
  return cur_backend;
  }

  // view_src
- if (node->view_src != NULL && wsp_ggml_get_backend(node->view_src) != NULL) {
- sprintf(causes[hash_id(node)], "1.vsrc");
- return wsp_ggml_get_backend(node->view_src);
+ if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
+ SET_CAUSE(node, "1.vsrc");
+ return get_buffer_backend(sched, node->view_src->buffer);
  }

  // src
@@ -510,7 +738,7 @@ static wsp_ggml_backend_t sched_backend_from_cur(wsp_ggml_backend_sched_t sched,
  if (src == NULL) {
  break;
  }
- wsp_ggml_backend_t src_backend = wsp_ggml_get_backend(src);
+ wsp_ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
  if (src_backend != NULL) {
  int src_prio = sched_backend_prio(sched, src_backend);
  size_t src_size = wsp_ggml_nbytes(src);
@@ -518,7 +746,7 @@ static wsp_ggml_backend_t sched_backend_from_cur(wsp_ggml_backend_sched_t sched,
  cur_prio = src_prio;
  cur_size = src_size;
  cur_backend = src_backend;
- sprintf(causes[hash_id(node)], "1.src%d", i);
+ SET_CAUSE(node, "1.src%d", i);
  }
  }
  }
@@ -539,10 +767,12 @@ static void sched_print_assignments(wsp_ggml_backend_sched_t sched, struct wsp_g
  int cur_split = 0;
  for (int i = 0; i < graph->n_nodes; i++) {
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
- wsp_ggml_backend_t split_backend = wsp_ggml_tallocr_get_buffer(sched->splits[cur_split].tallocr)->backend;
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, wsp_ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs);
+ wsp_ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
+ fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, wsp_ggml_backend_name(split_backend),
+ sched->splits[cur_split].n_inputs);
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(wsp_ggml_nbytes(sched->splits[cur_split].inputs[j])));
+ fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+ fmt_size(wsp_ggml_nbytes(sched->splits[cur_split].inputs[j])));
  }
  fprintf(stderr, "\n");
  cur_split++;
@@ -552,16 +782,18 @@ static void sched_print_assignments(wsp_ggml_backend_sched_t sched, struct wsp_g
  continue;
  }
  wsp_ggml_tallocr_t node_allocr = node_allocr(node);
- wsp_ggml_backend_t node_backend = node_allocr ? wsp_ggml_tallocr_get_buffer(node_allocr)->backend : NULL;
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, wsp_ggml_op_name(node->op), node->name, fmt_size(wsp_ggml_nbytes(node)), node_allocr ? wsp_ggml_backend_name(node_backend) : "NULL", causes[hash_id(node)]);
+ wsp_ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, wsp_ggml_op_name(node->op), node->name,
+ fmt_size(wsp_ggml_nbytes(node)), node_allocr ? wsp_ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
  for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
  struct wsp_ggml_tensor * src = node->src[j];
  if (src == NULL) {
  break;
  }
  wsp_ggml_tallocr_t src_allocr = node_allocr(src);
- wsp_ggml_backend_t src_backend = src_allocr ? wsp_ggml_tallocr_get_buffer(src_allocr)->backend : NULL;
- fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, fmt_size(wsp_ggml_nbytes(src)), src_backend ? wsp_ggml_backend_name(src_backend) : "NULL", causes[hash_id(src)]);
+ wsp_ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
+ fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
+ fmt_size(wsp_ggml_nbytes(src)), src_backend ? wsp_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
  }
  fprintf(stderr, "\n");
  }
@@ -587,9 +819,9 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
  sched->n_splits = 0;

  struct wsp_ggml_init_params params = {
- /*.mem_size = */ sizeof(sched->context_buffer),
- /*.mem_buffer = */ sched->context_buffer,
- /*.no_alloc = */ true
+ /* .mem_size = */ sizeof(sched->context_buffer),
+ /* .mem_buffer = */ sched->context_buffer,
+ /* .no_alloc = */ true
  };

  if (sched->ctx != NULL) {
@@ -605,9 +837,9 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
  // do not overwrite user assignments
  continue;
  }
- wsp_ggml_backend_t leaf_backend = wsp_ggml_get_backend(leaf);
+ wsp_ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
  if (leaf_backend == NULL && leaf->view_src != NULL) {
- leaf_backend = wsp_ggml_get_backend(leaf->view_src);
+ leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
  }
  if (leaf_backend != NULL) {
  node_allocr(leaf) = wsp_ggml_backend_sched_get_tallocr(sched, leaf_backend);
@@ -649,7 +881,7 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
  cur_prio = src_prio;
  cur_size = src_size;
  node_allocr = src_allocr;
- sprintf(causes[hash_id(node)], "2.src%d", j);
+ SET_CAUSE(node, "2.src%d", j);
  }
  }
  }
@@ -733,7 +965,7 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
  struct wsp_ggml_tensor * tensor_copy = wsp_ggml_dup_tensor_layout(sched->ctx, src);
  sched->node_copies[id][cur_backend_id] = tensor_copy;
  node_allocr(tensor_copy) = cur_allocr;
- wsp_ggml_backend_t backend = wsp_ggml_tallocr_get_buffer(cur_allocr)->backend;
+ wsp_ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
  wsp_ggml_format_name(tensor_copy, "%s#%s", wsp_ggml_backend_name(backend), src->name);
  }
  node->src[j] = sched->node_copies[id][cur_backend_id];
@@ -761,8 +993,8 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
  wsp_ggml_tallocr_t src_allocr = node_allocr(src);
  if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
  fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
- node->name, node_allocr ? wsp_ggml_backend_name(wsp_ggml_tallocr_get_buffer(node_allocr)->backend) : "NULL",
- j, src->name, src_allocr ? wsp_ggml_backend_name(wsp_ggml_tallocr_get_buffer(src_allocr)->backend) : "NULL");
+ node->name, node_allocr ? wsp_ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+ j, src->name, src_allocr ? wsp_ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
  }
  }
  }
@@ -773,7 +1005,7 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
  struct wsp_ggml_cgraph * graph_copy = wsp_ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*WSP_GGML_MAX_SPLIT_INPUTS, false);
  for (int i = 0; i < sched->n_splits; i++) {
  struct wsp_ggml_backend_sched_split * split = &sched->splits[i];
- split->graph = wsp_ggml_graph_view(sched->ctx, graph, split->i_start, split->i_end);
+ split->graph = wsp_ggml_graph_view(graph, split->i_start, split->i_end);

  // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
  for (int j = 0; j < split->n_inputs; j++) {
@@ -806,31 +1038,29 @@ static void sched_compute_splits(wsp_ggml_backend_sched_t sched) {

  for (int i = 0; i < sched->n_splits; i++) {
  struct wsp_ggml_backend_sched_split * split = &splits[i];
- wsp_ggml_backend_t split_backend = wsp_ggml_tallocr_get_buffer(split->tallocr)->backend;
+ wsp_ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
  int split_backend_id = sched_backend_prio(sched, split_backend);

  // copy the input tensors to the split backend
  uint64_t copy_start_us = wsp_ggml_time_us();
  for (int j = 0; j < split->n_inputs; j++) {
- struct wsp_ggml_tensor * input_cpy = sched->node_copies[hash_id(split->inputs[j])][sched_backend_prio(sched, split_backend)];
- if (split->inputs[j]->buffer == NULL) {
- if (split->inputs[j]->view_src == NULL) {
- fprintf(stderr, "input %s has no buffer and no view_src\n", split->inputs[j]->name);
+ struct wsp_ggml_tensor * input = split->inputs[j];
+ struct wsp_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
+ if (input->buffer == NULL) {
+ if (input->view_src == NULL) {
+ fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
  exit(1);
  }
- struct wsp_ggml_tensor * view = split->inputs[j];
- view->backend = view->view_src->backend;
- view->buffer = view->view_src->buffer;
- view->data = (char *)view->view_src->data + view->view_offs;
- wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_sched_get_buffer(sched, view->buffer->backend), view);
+ // FIXME: may need to use the sched buffer instead
+ wsp_ggml_backend_view_init(input->view_src->buffer, input);
  }
  if (input_cpy->buffer == NULL) {
  fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
  exit(1);
  }
- WSP_GGML_ASSERT(split->inputs[j]->buffer->backend != input_cpy->buffer->backend);
- WSP_GGML_ASSERT(input_cpy->buffer->backend == split_backend);
- wsp_ggml_backend_tensor_copy(split->inputs[j], input_cpy);
+ //WSP_GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
+ //WSP_GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+ wsp_ggml_backend_tensor_copy(input, input_cpy);
  }
  // wsp_ggml_backend_synchronize(split_backend);
  int64_t copy_end_us = wsp_ggml_time_us();
@@ -843,7 +1073,7 @@ static void sched_compute_splits(wsp_ggml_backend_sched_t sched) {
  #endif

  uint64_t compute_start_us = wsp_ggml_time_us();
- wsp_ggml_backend_graph_compute(split_backend, split->graph);
+ wsp_ggml_backend_graph_compute(split_backend, &split->graph);
  // wsp_ggml_backend_synchronize(split_backend);
  uint64_t compute_end_us = wsp_ggml_time_us();
  compute_us[split_backend_id] += compute_end_us - compute_start_us;
@@ -872,8 +1102,6 @@ wsp_ggml_backend_sched_t wsp_ggml_backend_sched_new(wsp_ggml_backend_t * backend
  struct wsp_ggml_backend_sched * sched = malloc(sizeof(struct wsp_ggml_backend_sched));
  memset(sched, 0, sizeof(struct wsp_ggml_backend_sched));

- fprintf(stderr, "wsp_ggml_backend_sched size: %lu KB\n", sizeof(struct wsp_ggml_backend_sched)/1024);
-
  sched->n_backends = n_backends;
  for (int i = 0; i < n_backends; i++) {
  sched->backends[i] = backends[i];
@@ -948,3 +1176,182 @@ void wsp_ggml_backend_sched_set_node_backend(wsp_ggml_backend_sched_t sched, str
  WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
  node_allocr(node) = sched->tallocs[backend_index];
  }
+
+ // utils
+ void wsp_ggml_backend_view_init(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
+ WSP_GGML_ASSERT(tensor->buffer == NULL);
+ WSP_GGML_ASSERT(tensor->data == NULL);
+ WSP_GGML_ASSERT(tensor->view_src != NULL);
+ WSP_GGML_ASSERT(tensor->view_src->buffer != NULL);
+ WSP_GGML_ASSERT(tensor->view_src->data != NULL);
+
+ tensor->buffer = buffer;
+ tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
+ tensor->backend = tensor->view_src->backend;
+ wsp_ggml_backend_buffer_init_tensor(buffer, tensor);
+ }
+
+ void wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, void * addr) {
+ WSP_GGML_ASSERT(tensor->buffer == NULL);
+ WSP_GGML_ASSERT(tensor->data == NULL);
+ WSP_GGML_ASSERT(tensor->view_src == NULL);
+ WSP_GGML_ASSERT(addr >= wsp_ggml_backend_buffer_get_base(buffer));
+ WSP_GGML_ASSERT((char *)addr + wsp_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
+ (char *)wsp_ggml_backend_buffer_get_base(buffer) + wsp_ggml_backend_buffer_get_size(buffer));
+
+ tensor->buffer = buffer;
+ tensor->data = addr;
+ wsp_ggml_backend_buffer_init_tensor(buffer, tensor);
+ }
+
+ static struct wsp_ggml_tensor * graph_dup_tensor(struct wsp_ggml_hash_set hash_set, struct wsp_ggml_tensor ** node_copies,
+ struct wsp_ggml_context * ctx_allocated, struct wsp_ggml_context * ctx_unallocated, struct wsp_ggml_tensor * src) {
+
+ WSP_GGML_ASSERT(src != NULL);
+ WSP_GGML_ASSERT(src->data && "graph must be allocated");
+
+ size_t id = wsp_ggml_hash_insert(hash_set, src);
+ if (id == WSP_GGML_HASHTABLE_ALREADY_EXISTS) {
+ return node_copies[wsp_ggml_hash_find(hash_set, src)];
+ }
+
+ struct wsp_ggml_tensor * dst = wsp_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
+ if (src->view_src != NULL) {
+ dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+ dst->view_offs = src->view_offs;
+ }
+ dst->op = src->op;
+ memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
+ wsp_ggml_set_name(dst, src->name);
+
+ // copy src
+ for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+ struct wsp_ggml_tensor * s = src->src[i];
+ if (s == NULL) {
+ break;
+ }
+ dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+ }
+
+ node_copies[id] = dst;
+ return dst;
+ }
+
+ static void graph_init_tensor(struct wsp_ggml_hash_set hash_set, struct wsp_ggml_tensor ** node_copies, bool * node_init, struct wsp_ggml_tensor * src) {
+ size_t id = wsp_ggml_hash_find(hash_set, src);
+ if (node_init[id]) {
+ return;
+ }
+ node_init[id] = true;
+
+ struct wsp_ggml_tensor * dst = node_copies[id];
+ if (dst->view_src != NULL) {
+ wsp_ggml_backend_view_init(dst->view_src->buffer, dst);
+ }
+ else {
+ wsp_ggml_backend_tensor_copy(src, dst);
+ }
+
+ // init src
+ for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+ struct wsp_ggml_tensor * s = src->src[i];
+ if (s == NULL) {
+ break;
+ }
+ graph_init_tensor(hash_set, node_copies, node_init, s);
+ }
+ }
+
+ struct wsp_ggml_backend_graph_copy wsp_ggml_backend_graph_copy(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * graph) {
+ struct wsp_ggml_hash_set hash_set = {
+ /* .size = */ graph->visited_hash_table.size,
+ /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
+ };
+ struct wsp_ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
+ bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
+
+ struct wsp_ggml_init_params params = {
+ /* .mem_size = */ wsp_ggml_tensor_overhead()*hash_set.size + wsp_ggml_graph_overhead_custom(graph->size, false),
+ /* .mem_buffer = */ NULL,
+ /* .no_alloc = */ true
+ };
+
+ struct wsp_ggml_context * ctx_allocated = wsp_ggml_init(params);
+ struct wsp_ggml_context * ctx_unallocated = wsp_ggml_init(params);
+
+ // dup nodes
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct wsp_ggml_tensor * node = graph->nodes[i];
+ graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+ }
+
+ // allocate nodes
+ wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+
+ //printf("copy buffer size: %zu MB\n", wsp_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
+
+ // copy data and init views
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct wsp_ggml_tensor * node = graph->nodes[i];
+ graph_init_tensor(hash_set, node_copies, node_init, node);
+ }
+
+ // build graph copy
+ struct wsp_ggml_cgraph * graph_copy = wsp_ggml_new_graph_custom(ctx_allocated, graph->size, false);
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct wsp_ggml_tensor * node = graph->nodes[i];
+ struct wsp_ggml_tensor * node_copy = node_copies[wsp_ggml_hash_find(hash_set, node)];
+ graph_copy->nodes[i] = node_copy;
+ }
+ graph_copy->n_nodes = graph->n_nodes;
+
+ free(hash_set.keys);
+ free(node_copies);
+ free(node_init);
+
+ return (struct wsp_ggml_backend_graph_copy) {
+ /* .buffer = */ buffer,
+ /* .ctx_allocated = */ ctx_allocated,
+ /* .ctx_unallocated = */ ctx_unallocated,
+ /* .graph = */ graph_copy,
+ };
+ }
+
+ void wsp_ggml_backend_graph_copy_free(struct wsp_ggml_backend_graph_copy copy) {
+ wsp_ggml_backend_buffer_free(copy.buffer);
+ wsp_ggml_free(copy.ctx_allocated);
+ wsp_ggml_free(copy.ctx_unallocated);
+ }
+
+ void wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data) {
+ struct wsp_ggml_backend_graph_copy copy = wsp_ggml_backend_graph_copy(backend2, graph);
+ struct wsp_ggml_cgraph * g1 = graph;
+ struct wsp_ggml_cgraph * g2 = copy.graph;
+
+ assert(g1->n_nodes == g2->n_nodes);
+
+ for (int i = 0; i < g1->n_nodes; i++) {
+ //printf("eval %d/%d\n", i, g1->n_nodes);
+ struct wsp_ggml_tensor * t1 = g1->nodes[i];
+ struct wsp_ggml_tensor * t2 = g2->nodes[i];
+
+ assert(t1->op == t2->op && wsp_ggml_are_same_layout(t1, t2));
+
+ struct wsp_ggml_cgraph g1v = wsp_ggml_graph_view(g1, i, i + 1);
+ struct wsp_ggml_cgraph g2v = wsp_ggml_graph_view(g2, i, i + 1);
+
+ wsp_ggml_backend_graph_compute(backend1, &g1v);
+ wsp_ggml_backend_graph_compute(backend2, &g2v);
+
+ if (wsp_ggml_is_view_op(t1->op)) {
+ continue;
+ }
+
+ // compare results, calculate rms etc
+ if (!callback(i, t1, t2, user_data)) {
+ break;
+ }
+ }
+
+ wsp_ggml_backend_graph_copy_free(copy);
+ }
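The wsp_ggml_backend_compare_graph_backend helper added at the end of this file copies a graph to a second backend and evaluates it node by node, handing each pair of result tensors to the supplied callback. A hypothetical callback is sketched below; it is not shipped with the package, it assumes the wsp_ggml_backend_eval_callback typedef matches the call site above (node index, the two tensors, user data, returning bool), and it assumes wsp_ggml_nelements and wsp_ggml_get_f32_1d are available with the wsp_ prefix, mirroring upstream ggml.

```c
#include <math.h>
#include <stdio.h>
#include <stdbool.h>
#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical eval callback for wsp_ggml_backend_compare_graph_backend (sketch only).
// Returning false stops the node-by-node comparison loop.
static bool compare_node_cb(int node_index, struct wsp_ggml_tensor * t1, struct wsp_ggml_tensor * t2, void * user_data) {
    const double tol = 1e-3; // assumed tolerance for this sketch
    double max_err = 0.0;

    // only f32 outputs are compared element-wise here; the read assumes the
    // tensor data is host-visible (otherwise copy it out with wsp_ggml_backend_tensor_get first)
    if (t1->type == WSP_GGML_TYPE_F32 && t2->type == WSP_GGML_TYPE_F32) {
        const int64_t n = wsp_ggml_nelements(t1);
        for (int64_t k = 0; k < n; k++) {
            const double d = fabs(wsp_ggml_get_f32_1d(t1, (int) k) - wsp_ggml_get_f32_1d(t2, (int) k));
            if (d > max_err) {
                max_err = d;
            }
        }
    }

    fprintf(stderr, "node %4d %-16s max abs err = %g\n", node_index, wsp_ggml_op_name(t1->op), max_err);

    (void) user_data;
    return max_err <= tol;
}

// usage (hypothetical backends): wsp_ggml_backend_compare_graph_backend(backend_cpu, backend_metal, gf, compare_node_cb, NULL);
```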