whisper.rn 0.4.0-rc.6 → 0.4.0-rc.8

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -15,7 +15,11 @@
15
15
 
16
16
  // backend buffer type
17
17
 
18
- wsp_ggml_backend_buffer_t wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
18
+ const char * wsp_ggml_backend_buft_name(wsp_ggml_backend_buffer_type_t buft) {
19
+ return buft->iface.get_name(buft);
20
+ }
21
+
22
+ WSP_GGML_CALL wsp_ggml_backend_buffer_t wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
19
23
  return buft->iface.alloc_buffer(buft, size);
20
24
  }
21
25
 
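Note: the hunk above adds a name getter for buffer types and tags the allocator with the new WSP_GGML_CALL calling-convention macro. A minimal caller-side sketch, assuming the declarations come from the wsp_-prefixed ggml-backend header bundled with the package (the header name and the use of the CPU buffer type are assumptions, not shown in this hunk):

    #include <stdio.h>
    #include "ggml-backend.h"   // assumed header name for the wsp_-prefixed ggml build

    // print the human-readable name (new in rc.8) and the alignment of a buffer type
    static void print_buft_info(wsp_ggml_backend_buffer_type_t buft) {
        printf("buffer type %s, alignment %zu\n",
               wsp_ggml_backend_buft_name(buft),
               wsp_ggml_backend_buft_get_alignment(buft));
    }

    // e.g. print_buft_info(wsp_ggml_backend_cpu_buffer_type());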
@@ -23,7 +27,7 @@ size_t wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_buffer_type_t buft)
23
27
  return buft->iface.get_alignment(buft);
24
28
  }
25
29
 
26
- size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft, struct wsp_ggml_tensor * tensor) {
30
+ WSP_GGML_CALL size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft, struct wsp_ggml_tensor * tensor) {
27
31
  // get_alloc_size is optional, defaults to wsp_ggml_nbytes
28
32
  if (buft->iface.get_alloc_size) {
29
33
  return buft->iface.get_alloc_size(buft, tensor);
@@ -35,9 +39,16 @@ bool wsp_ggml_backend_buft_supports_backend(wsp_ggml_backend_buffer_type_t buft,
35
39
  return buft->iface.supports_backend(buft, backend);
36
40
  }
37
41
 
42
+ bool wsp_ggml_backend_buft_is_host(wsp_ggml_backend_buffer_type_t buft) {
43
+ if (buft->iface.is_host) {
44
+ return buft->iface.is_host(buft);
45
+ }
46
+ return false;
47
+ }
48
+
38
49
  // backend buffer
39
50
 
40
- wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
51
+ WSP_GGML_CALL wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
41
52
  wsp_ggml_backend_buffer_type_t buft,
42
53
  struct wsp_ggml_backend_buffer_i iface,
43
54
  wsp_ggml_backend_buffer_context_t context,
@@ -51,11 +62,16 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
51
62
  /* .buft = */ buft,
52
63
  /* .context = */ context,
53
64
  /* .size = */ size,
65
+ /* .usage = */ WSP_GGML_BACKEND_BUFFER_USAGE_ANY
54
66
  };
55
67
 
56
68
  return buffer;
57
69
  }
58
70
 
71
+ const char * wsp_ggml_backend_buffer_name(wsp_ggml_backend_buffer_t buffer) {
72
+ return buffer->iface.get_name(buffer);
73
+ }
74
+
59
75
  void wsp_ggml_backend_buffer_free(wsp_ggml_backend_buffer_t buffer) {
60
76
  if (buffer == NULL) {
61
77
  return;
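Note: this hunk initializes the new per-buffer usage field to WSP_GGML_BACKEND_BUFFER_USAGE_ANY and exposes wsp_ggml_backend_buffer_name(). A hedged sketch of using the name getter for diagnostics (the helper below is illustrative, not part of the package):

    #include <stdio.h>
    #include "ggml-backend.h"   // assumed header name

    // report which backend buffer a tensor was allocated in
    static void log_tensor_buffer(const struct wsp_ggml_tensor * t) {
        if (t->buffer != NULL) {
            fprintf(stderr, "%s lives in buffer '%s'\n",
                    t->name, wsp_ggml_backend_buffer_name(t->buffer));
        }
    }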
@@ -79,7 +95,7 @@ void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
79
95
  return base;
80
96
  }
81
97
 
82
- void wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
98
+ WSP_GGML_CALL void wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
83
99
  // init_tensor is optional
84
100
  if (buffer->iface.init_tensor) {
85
101
  buffer->iface.init_tensor(buffer, tensor);
@@ -87,17 +103,43 @@ void wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struc
87
103
  }
88
104
 
89
105
  size_t wsp_ggml_backend_buffer_get_alignment (wsp_ggml_backend_buffer_t buffer) {
90
- return wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_buffer_type(buffer));
106
+ return wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_buffer_get_type(buffer));
91
107
  }
92
108
 
93
109
  size_t wsp_ggml_backend_buffer_get_alloc_size(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
94
- return wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type(buffer), tensor);
110
+ return wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_get_type(buffer), tensor);
111
+ }
112
+
113
+ void wsp_ggml_backend_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
114
+ buffer->iface.clear(buffer, value);
95
115
  }
96
116
 
97
- wsp_ggml_backend_buffer_type_t wsp_ggml_backend_buffer_type(wsp_ggml_backend_buffer_t buffer) {
117
+ bool wsp_ggml_backend_buffer_is_host(wsp_ggml_backend_buffer_t buffer) {
118
+ return wsp_ggml_backend_buft_is_host(wsp_ggml_backend_buffer_get_type(buffer));
119
+ }
120
+
121
+ void wsp_ggml_backend_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum wsp_ggml_backend_buffer_usage usage) {
122
+ buffer->usage = usage;
123
+ }
124
+
125
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_buffer_get_type(wsp_ggml_backend_buffer_t buffer) {
98
126
  return buffer->buft;
99
127
  }
100
128
 
129
+ void wsp_ggml_backend_buffer_reset(wsp_ggml_backend_buffer_t buffer) {
130
+ if (buffer->iface.reset) {
131
+ buffer->iface.reset(buffer);
132
+ }
133
+ }
134
+
135
+ bool wsp_ggml_backend_buffer_copy_tensor(const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
136
+ wsp_ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
137
+ if (dst_buf->iface.cpy_tensor) {
138
+ return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
139
+ }
140
+ return false;
141
+ }
142
+
101
143
  // backend
102
144
 
103
145
  const char * wsp_ggml_backend_name(wsp_ggml_backend_t backend) {
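Note: the buffer API grows clear, is_host, set_usage, get_type, reset and copy_tensor helpers in the hunk above; the WEIGHTS usage value is consumed by the scheduler changes further down in this diff. A hedged sketch combining them (the helper name and the memcpy fast path are illustrative):

    #include <string.h>
    #include "ggml-backend.h"   // assumed header name

    // zero a buffer, mark it as holding weights, then upload data into one of its tensors
    static void prepare_weights(wsp_ggml_backend_buffer_t buf,
                                struct wsp_ggml_tensor * t, const void * src, size_t n) {
        wsp_ggml_backend_buffer_clear(buf, 0);   // new: fill the buffer with a byte value
        wsp_ggml_backend_buffer_set_usage(buf, WSP_GGML_BACKEND_BUFFER_USAGE_WEIGHTS); // scheduler hint
        if (wsp_ggml_backend_buffer_is_host(buf)) {
            memcpy(t->data, src, n);                   // host memory: direct copy is fine
        } else {
            wsp_ggml_backend_tensor_set(t, src, 0, n); // device memory: go through the backend
        }
    }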
@@ -131,30 +173,42 @@ void wsp_ggml_backend_tensor_set_async(wsp_ggml_backend_t backend, struct wsp_gg
131
173
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
132
174
  WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
133
175
 
134
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
176
+ if (backend->iface.set_tensor_async == NULL) {
177
+ wsp_ggml_backend_tensor_set(tensor, data, offset, size);
178
+ } else {
179
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
180
+ }
135
181
  }
136
182
 
137
183
  void wsp_ggml_backend_tensor_get_async(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
138
184
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
139
185
  WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");
140
186
 
141
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
187
+ if (backend->iface.get_tensor_async == NULL) {
188
+ wsp_ggml_backend_tensor_get(tensor, data, offset, size);
189
+ } else {
190
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
191
+ }
142
192
  }
143
193
 
144
- void wsp_ggml_backend_tensor_set(struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
194
+ WSP_GGML_CALL void wsp_ggml_backend_tensor_set(struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
195
+ wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
196
+
145
197
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
146
- WSP_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
198
+ WSP_GGML_ASSERT(buf != NULL && "tensor buffer not set");
147
199
  WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
148
200
 
149
- tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
201
+ tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
150
202
  }
151
203
 
152
- void wsp_ggml_backend_tensor_get(const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
204
+ WSP_GGML_CALL void wsp_ggml_backend_tensor_get(const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
205
+ wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
206
+
153
207
  WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
154
208
  WSP_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
155
209
  WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");
156
210
 
157
- tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
211
+ tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
158
212
  }
159
213
 
160
214
  void wsp_ggml_backend_synchronize(wsp_ggml_backend_t backend) {
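Note: with the change above, the async tensor accessors fall back to the synchronous wsp_ggml_backend_tensor_set/get when a backend (such as the CPU backend) does not implement the async hooks, so callers no longer need to special-case them. A usage sketch; the synchronize call assumes the host buffer is reused immediately afterwards:

    #include "ggml-backend.h"   // assumed header name

    // queue an upload; on backends without async support this completes before returning
    static void upload_then_wait(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * t,
                                 const void * host_data, size_t nbytes) {
        wsp_ggml_backend_tensor_set_async(backend, t, host_data, 0, nbytes);
        wsp_ggml_backend_synchronize(backend); // ensure the copy has landed before host_data is reused
    }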
@@ -175,16 +229,10 @@ void wsp_ggml_backend_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backe
175
229
 
176
230
  void wsp_ggml_backend_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
177
231
  backend->iface.graph_plan_compute(backend, plan);
178
-
179
- // TODO: optional sync
180
- wsp_ggml_backend_synchronize(backend);
181
232
  }
182
233
 
183
- void wsp_ggml_backend_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
184
- backend->iface.graph_compute(backend, cgraph);
185
-
186
- // TODO: optional sync
187
- wsp_ggml_backend_synchronize(backend);
234
+ bool wsp_ggml_backend_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
235
+ return backend->iface.graph_compute(backend, cgraph);
188
236
  }
189
237
 
190
238
  bool wsp_ggml_backend_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
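Note: wsp_ggml_backend_graph_compute() now returns a success flag and no longer synchronizes implicitly (the TODO sync calls above were removed), so callers that read results on the host must synchronize themselves. A minimal sketch, assuming the bundled ggml-backend header:

    #include <stdio.h>
    #include "ggml-backend.h"   // assumed header name

    static int run_graph(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * graph) {
        if (!wsp_ggml_backend_graph_compute(backend, graph)) { // rc.8: returns bool instead of void
            fprintf(stderr, "graph compute failed on %s\n", wsp_ggml_backend_name(backend));
            return 1;
        }
        wsp_ggml_backend_synchronize(backend); // the implicit sync was removed in this version
        return 0;
    }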
@@ -209,28 +257,20 @@ static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const str
209
257
  }
210
258
 
211
259
  void wsp_ggml_backend_tensor_copy(struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
212
- //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
213
- //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
214
260
  WSP_GGML_ASSERT(wsp_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
215
261
 
216
- // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, wsp_ggml_backend_name(src->backend), wsp_ggml_backend_name(dst->backend), wsp_ggml_nbytes(src));
217
-
218
262
  if (src == dst) {
219
263
  return;
220
264
  }
221
265
 
222
- // TODO: allow backends to support copy to/from same backend
223
-
224
- if (dst->buffer->iface.cpy_tensor_from != NULL) {
225
- dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
226
- } else if (src->buffer->iface.cpy_tensor_to != NULL) {
227
- src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
228
- } else {
229
- // shouldn't be hit when copying from/to CPU
230
- #ifndef NDEBUG
231
- fprintf(stderr, "wsp_ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
232
- "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
233
- #endif
266
+ if (wsp_ggml_backend_buffer_is_host(src->buffer)) {
267
+ wsp_ggml_backend_tensor_set(dst, src->data, 0, wsp_ggml_nbytes(src));
268
+ } else if (wsp_ggml_backend_buffer_is_host(dst->buffer)) {
269
+ wsp_ggml_backend_tensor_get(src, dst->data, 0, wsp_ggml_nbytes(src));
270
+ } else if (!wsp_ggml_backend_buffer_copy_tensor(src, dst)) {
271
+ #ifndef NDEBUG
272
+ fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, wsp_ggml_backend_buffer_name(src->buffer), wsp_ggml_backend_buffer_name(dst->buffer));
273
+ #endif
234
274
  size_t nbytes = wsp_ggml_nbytes(src);
235
275
  void * data = malloc(nbytes);
236
276
  wsp_ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -239,6 +279,31 @@ void wsp_ggml_backend_tensor_copy(struct wsp_ggml_tensor * src, struct wsp_ggml_
239
279
  }
240
280
  }
241
281
 
282
+ void wsp_ggml_backend_tensor_copy_async(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
283
+ WSP_GGML_ASSERT(wsp_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
284
+
285
+ if (src == dst) {
286
+ return;
287
+ }
288
+
289
+ if (wsp_ggml_backend_buft_supports_backend(src->buffer->buft, backend) && wsp_ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
290
+ if (backend->iface.cpy_tensor_async != NULL) {
291
+ if (backend->iface.cpy_tensor_async(backend, src, dst)) {
292
+ return;
293
+ }
294
+ }
295
+ }
296
+
297
+ size_t nbytes = wsp_ggml_nbytes(src);
298
+ if (wsp_ggml_backend_buffer_is_host(src->buffer)) {
299
+ wsp_ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
300
+ }
301
+ else {
302
+ wsp_ggml_backend_tensor_copy(src, dst);
303
+ }
304
+ }
305
+
306
+
242
307
  // backend registry
243
308
 
244
309
  #define WSP_GGML_MAX_BACKENDS_REG 16
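Note: the new wsp_ggml_backend_tensor_copy_async() above tries a backend-provided async copy and otherwise degrades to the host set/get or synchronous copy paths. A caller-side sketch (the synchronize is only needed if the destination is read right away):

    #include "ggml-backend.h"   // assumed header name

    static void stage_input(wsp_ggml_backend_t backend,
                            struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
        wsp_ggml_backend_tensor_copy_async(backend, src, dst);
        wsp_ggml_backend_synchronize(backend); // the copy may still be in flight on async backends
    }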
@@ -253,9 +318,9 @@ struct wsp_ggml_backend_reg {
253
318
  static struct wsp_ggml_backend_reg wsp_ggml_backend_registry[WSP_GGML_MAX_BACKENDS_REG];
254
319
  static size_t wsp_ggml_backend_registry_count = 0;
255
320
 
256
- static wsp_ggml_backend_t wsp_ggml_backend_reg_cpu_init(const char * params, void * user_data);
321
+ WSP_GGML_CALL static wsp_ggml_backend_t wsp_ggml_backend_reg_cpu_init(const char * params, void * user_data);
257
322
 
258
- static void wsp_ggml_backend_registry_init(void) {
323
+ WSP_GGML_CALL static void wsp_ggml_backend_registry_init(void) {
259
324
  static bool initialized = false;
260
325
 
261
326
  if (initialized) {
@@ -268,21 +333,21 @@ static void wsp_ggml_backend_registry_init(void) {
268
333
 
269
334
  // add forward decls here to avoid including the backend headers
270
335
  #ifdef WSP_GGML_USE_CUBLAS
271
- extern void wsp_ggml_backend_cuda_reg_devices(void);
336
+ extern WSP_GGML_CALL void wsp_ggml_backend_cuda_reg_devices(void);
272
337
  wsp_ggml_backend_cuda_reg_devices();
273
338
  #endif
274
339
 
275
340
  #ifdef WSP_GGML_USE_METAL
276
- extern wsp_ggml_backend_t wsp_ggml_backend_reg_metal_init(const char * params, void * user_data);
277
- extern wsp_ggml_backend_buffer_type_t wsp_ggml_backend_metal_buffer_type(void);
341
+ extern WSP_GGML_CALL wsp_ggml_backend_t wsp_ggml_backend_reg_metal_init(const char * params, void * user_data);
342
+ extern WSP_GGML_CALL wsp_ggml_backend_buffer_type_t wsp_ggml_backend_metal_buffer_type(void);
278
343
  wsp_ggml_backend_register("Metal", wsp_ggml_backend_reg_metal_init, wsp_ggml_backend_metal_buffer_type(), NULL);
279
344
  #endif
280
345
  }
281
346
 
282
- void wsp_ggml_backend_register(const char * name, wsp_ggml_backend_init_fn init_fn, wsp_ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
347
+ WSP_GGML_CALL void wsp_ggml_backend_register(const char * name, wsp_ggml_backend_init_fn init_fn, wsp_ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
283
348
  WSP_GGML_ASSERT(wsp_ggml_backend_registry_count < WSP_GGML_MAX_BACKENDS_REG);
284
349
 
285
- int id = wsp_ggml_backend_registry_count;
350
+ size_t id = wsp_ggml_backend_registry_count;
286
351
 
287
352
  wsp_ggml_backend_registry[id] = (struct wsp_ggml_backend_reg) {
288
353
  /* .name = */ {0},
@@ -315,6 +380,8 @@ size_t wsp_ggml_backend_reg_find_by_name(const char * name) {
315
380
  return i;
316
381
  }
317
382
  }
383
+
384
+ // not found
318
385
  return SIZE_MAX;
319
386
  }
320
387
 
@@ -325,15 +392,15 @@ wsp_ggml_backend_t wsp_ggml_backend_reg_init_backend_from_str(const char * backe
325
392
  const char * params = strchr(backend_str, ':');
326
393
  char backend_name[128];
327
394
  if (params == NULL) {
328
- strcpy(backend_name, backend_str);
395
+ snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
329
396
  params = "";
330
397
  } else {
331
- strncpy(backend_name, backend_str, params - backend_str);
332
- backend_name[params - backend_str] = '\0';
398
+ snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
333
399
  params++;
334
400
  }
335
401
 
336
402
  size_t backend_i = wsp_ggml_backend_reg_find_by_name(backend_name);
403
+
337
404
  if (backend_i == SIZE_MAX) {
338
405
  fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
339
406
  return NULL;
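Note: the hunk above replaces strcpy/strncpy with bounded snprintf when splitting a "backend:params" string, removing a potential overflow of the 128-byte name buffer. The same pattern in isolation, independent of ggml (the helper name is illustrative):

    #include <stdio.h>
    #include <string.h>

    // split "backend:params" into a bounded, NUL-terminated name and a params pointer
    static const char * split_backend_str(const char * s, char * name, size_t name_size) {
        const char * colon = strchr(s, ':');
        if (colon == NULL) {
            snprintf(name, name_size, "%s", s);                      // bounded copy of the whole string
            return "";
        }
        snprintf(name, name_size, "%.*s", (int)(colon - s), s);      // copy only the name part
        return colon + 1;                                            // params start after the ':'
    }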
@@ -372,69 +439,80 @@ wsp_ggml_backend_buffer_t wsp_ggml_backend_reg_alloc_buffer(size_t i, size_t siz
372
439
 
373
440
  // backend CPU
374
441
 
375
- static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
442
+ WSP_GGML_CALL static const char * wsp_ggml_backend_cpu_buffer_name(wsp_ggml_backend_buffer_t buffer) {
443
+ return "CPU";
444
+
445
+ WSP_GGML_UNUSED(buffer);
446
+ }
447
+
448
+ WSP_GGML_CALL static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
376
449
  return (void *)buffer->context;
377
450
  }
378
451
 
379
- static void wsp_ggml_backend_cpu_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
452
+ WSP_GGML_CALL static void wsp_ggml_backend_cpu_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
380
453
  free(buffer->context);
381
- WSP_GGML_UNUSED(buffer);
382
454
  }
383
455
 
384
- static void wsp_ggml_backend_cpu_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
385
- WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
386
- WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
387
-
456
+ WSP_GGML_CALL static void wsp_ggml_backend_cpu_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
388
457
  memcpy((char *)tensor->data + offset, data, size);
389
458
 
390
459
  WSP_GGML_UNUSED(buffer);
391
460
  }
392
461
 
393
- static void wsp_ggml_backend_cpu_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
394
- WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");
395
- WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
396
-
462
+ WSP_GGML_CALL static void wsp_ggml_backend_cpu_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
397
463
  memcpy(data, (const char *)tensor->data + offset, size);
398
464
 
399
465
  WSP_GGML_UNUSED(buffer);
400
466
  }
401
467
 
402
- static void wsp_ggml_backend_cpu_buffer_cpy_tensor_from(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
403
- wsp_ggml_backend_tensor_get(src, dst->data, 0, wsp_ggml_nbytes(src));
468
+ WSP_GGML_CALL static bool wsp_ggml_backend_cpu_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
469
+ if (wsp_ggml_backend_buffer_is_host(src->buffer)) {
470
+ memcpy(dst->data, src->data, wsp_ggml_nbytes(src));
471
+ return true;
472
+ }
473
+ return false;
404
474
 
405
475
  WSP_GGML_UNUSED(buffer);
406
476
  }
407
477
 
408
- static void wsp_ggml_backend_cpu_buffer_cpy_tensor_to(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
409
- wsp_ggml_backend_tensor_set(dst, src->data, 0, wsp_ggml_nbytes(src));
410
-
411
- WSP_GGML_UNUSED(buffer);
478
+ WSP_GGML_CALL static void wsp_ggml_backend_cpu_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
479
+ memset(buffer->context, value, buffer->size);
412
480
  }
413
481
 
414
482
  static struct wsp_ggml_backend_buffer_i cpu_backend_buffer_i = {
483
+ /* .get_name = */ wsp_ggml_backend_cpu_buffer_name,
415
484
  /* .free_buffer = */ wsp_ggml_backend_cpu_buffer_free_buffer,
416
485
  /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
417
486
  /* .init_tensor = */ NULL, // no initialization required
418
487
  /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
419
488
  /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
420
- /* .cpy_tensor_from = */ wsp_ggml_backend_cpu_buffer_cpy_tensor_from,
421
- /* .cpy_tensor_to = */ wsp_ggml_backend_cpu_buffer_cpy_tensor_to,
489
+ /* .cpy_tensor = */ wsp_ggml_backend_cpu_buffer_cpy_tensor,
490
+ /* .clear = */ wsp_ggml_backend_cpu_buffer_clear,
491
+ /* .reset = */ NULL,
422
492
  };
423
493
 
424
494
  // for buffers from ptr, free is not called
425
495
  static struct wsp_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
496
+ /* .get_name = */ wsp_ggml_backend_cpu_buffer_name,
426
497
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
427
498
  /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
428
499
  /* .init_tensor = */ NULL, // no initialization required
429
500
  /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
430
501
  /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
431
- /* .cpy_tensor_from = */ wsp_ggml_backend_cpu_buffer_cpy_tensor_from,
432
- /* .cpy_tensor_to = */ wsp_ggml_backend_cpu_buffer_cpy_tensor_to,
502
+ /* .cpy_tensor = */ wsp_ggml_backend_cpu_buffer_cpy_tensor,
503
+ /* .clear = */ wsp_ggml_backend_cpu_buffer_clear,
504
+ /* .reset = */ NULL,
433
505
  };
434
506
 
435
507
  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
436
508
 
437
- static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
509
+ WSP_GGML_CALL static const char * wsp_ggml_backend_cpu_buffer_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
510
+ return "CPU";
511
+
512
+ WSP_GGML_UNUSED(buft);
513
+ }
514
+
515
+ WSP_GGML_CALL static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
438
516
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
439
517
  void * data = malloc(size); // TODO: maybe use WSP_GGML_ALIGNED_MALLOC?
440
518
 
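Note: the hunk above rewires the CPU buffer interface tables: get_name, cpy_tensor, clear and reset replace the old cpy_tensor_from/cpy_tensor_to pair, for both the owned and the from_ptr variants. A hedged sketch that wraps caller-owned memory and exercises the new clear hook (the helper is illustrative; the from_ptr buffer never frees the pointer):

    #include <stdlib.h>
    #include "ggml-backend.h"   // assumed header name

    static wsp_ggml_backend_buffer_t wrap_scratch(size_t size) {
        void * ptr = malloc(size);                         // caller keeps ownership of this memory
        wsp_ggml_backend_buffer_t buf = wsp_ggml_backend_cpu_buffer_from_ptr(ptr, size);
        wsp_ggml_backend_buffer_clear(buf, 0);             // routed to the new clear hook (memset)
        return buf;
    }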
@@ -443,31 +521,95 @@ static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_type_alloc_buffer(w
443
521
  return wsp_ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
444
522
  }
445
523
 
446
- static size_t wsp_ggml_backend_cpu_buffer_type_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
524
+ WSP_GGML_CALL static size_t wsp_ggml_backend_cpu_buffer_type_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
447
525
  return TENSOR_ALIGNMENT;
448
526
 
449
527
  WSP_GGML_UNUSED(buft);
450
528
  }
451
529
 
452
- static bool wsp_ggml_backend_cpu_buffer_type_supports_backend(wsp_ggml_backend_buffer_type_t buft, wsp_ggml_backend_t backend) {
530
+ WSP_GGML_CALL static bool wsp_ggml_backend_cpu_buffer_type_supports_backend(wsp_ggml_backend_buffer_type_t buft, wsp_ggml_backend_t backend) {
453
531
  return wsp_ggml_backend_is_cpu(backend);
454
532
 
455
533
  WSP_GGML_UNUSED(buft);
456
534
  }
457
535
 
458
- wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_buffer_type(void) {
459
- static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_buffer_type_cpu = {
536
+ WSP_GGML_CALL static bool wsp_ggml_backend_cpu_buffer_type_is_host(wsp_ggml_backend_buffer_type_t buft) {
537
+ return true;
538
+
539
+ WSP_GGML_UNUSED(buft);
540
+ }
541
+
542
+ WSP_GGML_CALL wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_buffer_type(void) {
543
+ static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type = {
460
544
  /* .iface = */ {
545
+ /* .get_name = */ wsp_ggml_backend_cpu_buffer_type_get_name,
461
546
  /* .alloc_buffer = */ wsp_ggml_backend_cpu_buffer_type_alloc_buffer,
462
547
  /* .get_alignment = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
463
548
  /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
464
549
  /* .supports_backend = */ wsp_ggml_backend_cpu_buffer_type_supports_backend,
550
+ /* .is_host = */ wsp_ggml_backend_cpu_buffer_type_is_host,
465
551
  },
466
552
  /* .context = */ NULL,
467
553
  };
468
554
 
469
- return &wsp_ggml_backend_buffer_type_cpu;
555
+ return &wsp_ggml_backend_cpu_buffer_type;
556
+ }
557
+
558
+ #ifdef WSP_GGML_USE_CPU_HBM
559
+
560
+ // buffer type HBM
561
+
562
+ #include <hbwmalloc.h>
563
+
564
+ WSP_GGML_CALL static const char * wsp_ggml_backend_cpu_hbm_buffer_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
565
+ return "CPU_HBM";
566
+
567
+ WSP_GGML_UNUSED(buft);
568
+ }
569
+
570
+ WSP_GGML_CALL static const char * wsp_ggml_backend_cpu_hbm_buffer_get_name(wsp_ggml_backend_buffer_t buf) {
571
+ return "CPU_HBM";
572
+
573
+ WSP_GGML_UNUSED(buf);
574
+ }
575
+
576
+ WSP_GGML_CALL static void wsp_ggml_backend_cpu_hbm_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
577
+ hbw_free(buffer->context);
578
+ }
579
+
580
+ WSP_GGML_CALL static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
581
+ //void * ptr = hbw_malloc(size);
582
+ void * ptr;
583
+ int result = hbw_posix_memalign(&ptr, wsp_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
584
+ if (result != 0) {
585
+ fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
586
+ return NULL;
587
+ }
588
+
589
+ wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_cpu_buffer_from_ptr(ptr, size);
590
+ buffer->buft = buft;
591
+ buffer->iface.get_name = wsp_ggml_backend_cpu_hbm_buffer_get_name;
592
+ buffer->iface.free_buffer = wsp_ggml_backend_cpu_hbm_buffer_free_buffer;
593
+
594
+ return buffer;
595
+ }
596
+
597
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_hbm_buffer_type(void) {
598
+ static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type_hbm = {
599
+ /* .iface = */ {
600
+ /* .get_name = */ wsp_ggml_backend_cpu_hbm_buffer_type_get_name,
601
+ /* .alloc_buffer = */ wsp_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
602
+ /* .get_alignment = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
603
+ /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
604
+ /* .supports_backend = */ wsp_ggml_backend_cpu_buffer_type_supports_backend,
605
+ /* .is_host = */ wsp_ggml_backend_cpu_buffer_type_is_host,
606
+ },
607
+ /* .context = */ NULL,
608
+ };
609
+
610
+ return &wsp_ggml_backend_cpu_buffer_type_hbm;
470
611
  }
612
+ #endif
471
613
 
472
614
  struct wsp_ggml_backend_cpu_context {
473
615
  int n_threads;
@@ -475,20 +617,20 @@ struct wsp_ggml_backend_cpu_context {
475
617
  size_t work_size;
476
618
  };
477
619
 
478
- static const char * wsp_ggml_backend_cpu_name(wsp_ggml_backend_t backend) {
620
+ WSP_GGML_CALL static const char * wsp_ggml_backend_cpu_name(wsp_ggml_backend_t backend) {
479
621
  return "CPU";
480
622
 
481
623
  WSP_GGML_UNUSED(backend);
482
624
  }
483
625
 
484
- static void wsp_ggml_backend_cpu_free(wsp_ggml_backend_t backend) {
626
+ WSP_GGML_CALL static void wsp_ggml_backend_cpu_free(wsp_ggml_backend_t backend) {
485
627
  struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
486
628
  free(cpu_ctx->work_data);
487
629
  free(cpu_ctx);
488
630
  free(backend);
489
631
  }
490
632
 
491
- static wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_get_default_buffer_type(wsp_ggml_backend_t backend) {
633
+ WSP_GGML_CALL static wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_get_default_buffer_type(wsp_ggml_backend_t backend) {
492
634
  return wsp_ggml_backend_cpu_buffer_type();
493
635
 
494
636
  WSP_GGML_UNUSED(backend);
@@ -499,13 +641,13 @@ struct wsp_ggml_backend_plan_cpu {
499
641
  struct wsp_ggml_cgraph cgraph;
500
642
  };
501
643
 
502
- static wsp_ggml_backend_graph_plan_t wsp_ggml_backend_cpu_graph_plan_create(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
644
+ WSP_GGML_CALL static wsp_ggml_backend_graph_plan_t wsp_ggml_backend_cpu_graph_plan_create(wsp_ggml_backend_t backend, const struct wsp_ggml_cgraph * cgraph) {
503
645
  struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
504
646
 
505
647
  struct wsp_ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct wsp_ggml_backend_plan_cpu));
506
648
 
507
649
  cpu_plan->cplan = wsp_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
508
- cpu_plan->cgraph = *cgraph;
650
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
509
651
 
510
652
  if (cpu_plan->cplan.work_size > 0) {
511
653
  cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -514,7 +656,7 @@ static wsp_ggml_backend_graph_plan_t wsp_ggml_backend_cpu_graph_plan_create(wsp_
514
656
  return cpu_plan;
515
657
  }
516
658
 
517
- static void wsp_ggml_backend_cpu_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
659
+ WSP_GGML_CALL static void wsp_ggml_backend_cpu_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
518
660
  struct wsp_ggml_backend_plan_cpu * cpu_plan = (struct wsp_ggml_backend_plan_cpu *)plan;
519
661
 
520
662
  free(cpu_plan->cplan.work_data);
@@ -523,7 +665,7 @@ static void wsp_ggml_backend_cpu_graph_plan_free(wsp_ggml_backend_t backend, wsp
523
665
  WSP_GGML_UNUSED(backend);
524
666
  }
525
667
 
526
- static void wsp_ggml_backend_cpu_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
668
+ WSP_GGML_CALL static void wsp_ggml_backend_cpu_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
527
669
  struct wsp_ggml_backend_plan_cpu * cpu_plan = (struct wsp_ggml_backend_plan_cpu *)plan;
528
670
 
529
671
  wsp_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -531,7 +673,7 @@ static void wsp_ggml_backend_cpu_graph_plan_compute(wsp_ggml_backend_t backend,
531
673
  WSP_GGML_UNUSED(backend);
532
674
  }
533
675
 
534
- static void wsp_ggml_backend_cpu_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
676
+ WSP_GGML_CALL static bool wsp_ggml_backend_cpu_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
535
677
  struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
536
678
 
537
679
  struct wsp_ggml_cplan cplan = wsp_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -545,13 +687,20 @@ static void wsp_ggml_backend_cpu_graph_compute(wsp_ggml_backend_t backend, struc
545
687
  cplan.work_data = cpu_ctx->work_data;
546
688
 
547
689
  wsp_ggml_graph_compute(cgraph, &cplan);
690
+ return true;
548
691
  }
549
692
 
550
- static bool wsp_ggml_backend_cpu_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
551
- return true;
693
+ WSP_GGML_CALL static bool wsp_ggml_backend_cpu_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
694
+ switch (op->op) {
695
+ case WSP_GGML_OP_CPY:
696
+ return op->type != WSP_GGML_TYPE_IQ2_XXS && op->type != WSP_GGML_TYPE_IQ2_XS; // missing type_traits.from_float
697
+ case WSP_GGML_OP_MUL_MAT:
698
+ return op->src[1]->type == WSP_GGML_TYPE_F32 || op->src[1]->type == wsp_ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
699
+ default:
700
+ return true;
701
+ }
552
702
 
553
703
  WSP_GGML_UNUSED(backend);
554
- WSP_GGML_UNUSED(op);
555
704
  }
556
705
 
557
706
  static struct wsp_ggml_backend_i cpu_backend_i = {
@@ -560,8 +709,7 @@ static struct wsp_ggml_backend_i cpu_backend_i = {
560
709
  /* .get_default_buffer_type = */ wsp_ggml_backend_cpu_get_default_buffer_type,
561
710
  /* .set_tensor_async = */ NULL,
562
711
  /* .get_tensor_async = */ NULL,
563
- /* .cpy_tensor_from_async = */ NULL,
564
- /* .cpy_tensor_to_async = */ NULL,
712
+ /* .cpy_tensor_async = */ NULL,
565
713
  /* .synchronize = */ NULL,
566
714
  /* .graph_plan_create = */ wsp_ggml_backend_cpu_graph_plan_create,
567
715
  /* .graph_plan_free = */ wsp_ggml_backend_cpu_graph_plan_free,
@@ -586,8 +734,8 @@ wsp_ggml_backend_t wsp_ggml_backend_cpu_init(void) {
586
734
  return cpu_backend;
587
735
  }
588
736
 
589
- bool wsp_ggml_backend_is_cpu(wsp_ggml_backend_t backend) {
590
- return backend->iface.get_name == wsp_ggml_backend_cpu_name;
737
+ WSP_GGML_CALL bool wsp_ggml_backend_is_cpu(wsp_ggml_backend_t backend) {
738
+ return backend && backend->iface.get_name == wsp_ggml_backend_cpu_name;
591
739
  }
592
740
 
593
741
  void wsp_ggml_backend_cpu_set_n_threads(wsp_ggml_backend_t backend_cpu, int n_threads) {
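Note: wsp_ggml_backend_is_cpu() now tolerates a NULL backend. Combined with the CPU entry points around it, a minimal init sketch (the thread-count handling is illustrative):

    #include "ggml-backend.h"   // assumed header name

    static wsp_ggml_backend_t init_cpu_backend(int n_threads) {
        wsp_ggml_backend_t backend = wsp_ggml_backend_cpu_init();
        if (wsp_ggml_backend_is_cpu(backend)) {            // safe even if init returned NULL in rc.8
            wsp_ggml_backend_cpu_set_n_threads(backend, n_threads);
        }
        return backend;
    }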
@@ -597,11 +745,11 @@ void wsp_ggml_backend_cpu_set_n_threads(wsp_ggml_backend_t backend_cpu, int n_th
597
745
  ctx->n_threads = n_threads;
598
746
  }
599
747
 
600
- wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
748
+ WSP_GGML_CALL wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
601
749
  return wsp_ggml_backend_buffer_init(wsp_ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
602
750
  }
603
751
 
604
- static wsp_ggml_backend_t wsp_ggml_backend_reg_cpu_init(const char * params, void * user_data) {
752
+ WSP_GGML_CALL static wsp_ggml_backend_t wsp_ggml_backend_reg_cpu_init(const char * params, void * user_data) {
605
753
  return wsp_ggml_backend_cpu_init();
606
754
 
607
755
  WSP_GGML_UNUSED(params);
@@ -611,7 +759,7 @@ static wsp_ggml_backend_t wsp_ggml_backend_reg_cpu_init(const char * params, voi
611
759
 
612
760
  // scheduler
613
761
 
614
- #define WSP_GGML_MAX_BACKENDS 4
762
+ #define WSP_GGML_MAX_BACKENDS 16
615
763
  #define WSP_GGML_MAX_SPLITS 256
616
764
  #define WSP_GGML_MAX_SPLIT_INPUTS 16
617
765
 
@@ -621,21 +769,29 @@ struct wsp_ggml_backend_sched_split {
621
769
  int i_end;
622
770
  struct wsp_ggml_tensor * inputs[WSP_GGML_MAX_SPLIT_INPUTS];
623
771
  int n_inputs;
772
+ // graph view of this split
624
773
  struct wsp_ggml_cgraph graph;
625
774
  };
626
775
 
627
776
  struct wsp_ggml_backend_sched {
777
+ bool is_reset; // true if the scheduler has been reset since the last graph split
778
+
628
779
  int n_backends;
629
780
  wsp_ggml_backend_t backends[WSP_GGML_MAX_BACKENDS];
781
+ wsp_ggml_backend_buffer_type_t bufts[WSP_GGML_MAX_BACKENDS];
630
782
  wsp_ggml_tallocr_t tallocs[WSP_GGML_MAX_BACKENDS];
631
783
 
632
784
  wsp_ggml_gallocr_t galloc;
633
785
 
786
+ // hash keys of the nodes in the graph
634
787
  struct wsp_ggml_hash_set hash_set;
635
- wsp_ggml_tallocr_t * node_talloc; // [hash_set.size]
636
- struct wsp_ggml_tensor * (* node_copies)[WSP_GGML_MAX_BACKENDS]; // [hash_set.size][WSP_GGML_MAX_BACKENDS]
788
+ // hash values (arrays of [hash_set.size])
789
+ wsp_ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
790
+ struct wsp_ggml_tensor * (* node_copies)[WSP_GGML_MAX_BACKENDS]; // copies of each node for each destination backend
637
791
 
792
+ // copy of the graph with modified inputs
638
793
  struct wsp_ggml_cgraph * graph;
794
+
639
795
  struct wsp_ggml_backend_sched_split splits[WSP_GGML_MAX_SPLITS];
640
796
  int n_splits;
641
797
 
@@ -648,6 +804,9 @@ struct wsp_ggml_backend_sched {
648
804
  __attribute__((aligned(WSP_GGML_MEM_ALIGN)))
649
805
  #endif
650
806
  char context_buffer[WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS*sizeof(struct wsp_ggml_tensor) + sizeof(struct wsp_ggml_cgraph)];
807
+
808
+ wsp_ggml_backend_sched_eval_callback callback_eval;
809
+ void * callback_eval_user_data;
651
810
  };
652
811
 
653
812
  #define hash_id(node) wsp_ggml_hash_find_or_insert(sched->hash_set, node)
@@ -676,14 +835,22 @@ static int sched_allocr_prio(wsp_ggml_backend_sched_t sched, wsp_ggml_tallocr_t
676
835
  return INT_MAX;
677
836
  }
678
837
 
679
- static wsp_ggml_backend_t get_buffer_backend(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_buffer_t buffer) {
838
+ static wsp_ggml_tallocr_t sched_allocr_from_buffer(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_buffer_t buffer) {
680
839
  if (buffer == NULL) {
681
840
  return NULL;
682
841
  }
842
+
843
+ // check if this is already allocate in a allocr buffer (from user manual allocations)
844
+ for (int i = 0; i < sched->n_backends; i++) {
845
+ if (wsp_ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
846
+ return sched->tallocs[i];
847
+ }
848
+ }
849
+
683
850
  // find highest prio backend that supports the buffer type
684
851
  for (int i = 0; i < sched->n_backends; i++) {
685
852
  if (wsp_ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
686
- return sched->backends[i];
853
+ return sched->tallocs[i];
687
854
  }
688
855
  }
689
856
  WSP_GGML_ASSERT(false && "tensor buffer type not supported by any backend");
@@ -693,7 +860,6 @@ static wsp_ggml_backend_t get_allocr_backend(wsp_ggml_backend_sched_t sched, wsp
693
860
  if (allocr == NULL) {
694
861
  return NULL;
695
862
  }
696
- // find highest prio backend that supports the buffer type
697
863
  for (int i = 0; i < sched->n_backends; i++) {
698
864
  if (sched->tallocs[i] == allocr) {
699
865
  return sched->backends[i];
@@ -703,7 +869,7 @@ static wsp_ggml_backend_t get_allocr_backend(wsp_ggml_backend_sched_t sched, wsp
703
869
  }
704
870
 
705
871
  #if 0
706
- static char causes[WSP_GGML_DEFAULT_GRAPH_SIZE*8 + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
872
+ static char causes[WSP_GGML_DEFAULT_GRAPH_SIZE*16 + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS][128]; // debug only
707
873
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
708
874
  #define GET_CAUSE(node) causes[hash_id(node)]
709
875
  #else
@@ -712,45 +878,37 @@ static char causes[WSP_GGML_DEFAULT_GRAPH_SIZE*8 + WSP_GGML_MAX_SPLITS*WSP_GGML_
712
878
  #endif
713
879
 
714
880
  // returns the backend that should be used for the node based on the current locations
715
- static wsp_ggml_backend_t sched_backend_from_cur(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node) {
716
- // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
717
- // ie. kv cache updates
718
- // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
881
+ static wsp_ggml_tallocr_t sched_allocr_from_cur(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node) {
882
+ // assign pre-allocated nodes to their backend
719
883
  // dst
720
- wsp_ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
721
- if (cur_backend != NULL) {
884
+ wsp_ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
885
+ if (cur_allocr != NULL) {
722
886
  SET_CAUSE(node, "1.dst");
723
- return cur_backend;
887
+ return cur_allocr;
724
888
  }
725
-
726
889
  // view_src
727
- if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
728
- SET_CAUSE(node, "1.vsrc");
729
- return get_buffer_backend(sched, node->view_src->buffer);
890
+ if (node->view_src != NULL) {
891
+ cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
892
+ if (cur_allocr != NULL) {
893
+ SET_CAUSE(node, "1.vsrc");
894
+ return cur_allocr;
895
+ }
730
896
  }
731
-
732
- // src
733
- int cur_prio = INT_MAX;
734
- size_t cur_size = 0;
735
-
897
+ // assign nodes that use weights to the backend of the weights
736
898
  for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
737
899
  const struct wsp_ggml_tensor * src = node->src[i];
738
900
  if (src == NULL) {
739
901
  break;
740
902
  }
741
- wsp_ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
742
- if (src_backend != NULL) {
743
- int src_prio = sched_backend_prio(sched, src_backend);
744
- size_t src_size = wsp_ggml_nbytes(src);
745
- if (src_prio < cur_prio && src_size >= cur_size) {
746
- cur_prio = src_prio;
747
- cur_size = src_size;
748
- cur_backend = src_backend;
749
- SET_CAUSE(node, "1.src%d", i);
750
- }
903
+ if (src->buffer != NULL && src->buffer->usage == WSP_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
904
+ wsp_ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
905
+ // operations with weights are always run on the same backend as the weights
906
+ SET_CAUSE(node, "1.wgt%d", i);
907
+ return src_allocr;
751
908
  }
752
909
  }
753
- return cur_backend;
910
+
911
+ return NULL;
754
912
  }
755
913
 
756
914
  static char * fmt_size(size_t size) {
@@ -783,7 +941,7 @@ static void sched_print_assignments(wsp_ggml_backend_sched_t sched, struct wsp_g
783
941
  }
784
942
  wsp_ggml_tallocr_t node_allocr = node_allocr(node);
785
943
  wsp_ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
786
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, wsp_ggml_op_name(node->op), node->name,
944
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, wsp_ggml_op_name(node->op), node->name,
787
945
  fmt_size(wsp_ggml_nbytes(node)), node_allocr ? wsp_ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
788
946
  for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
789
947
  struct wsp_ggml_tensor * src = node->src[j];
@@ -792,7 +950,7 @@ static void sched_print_assignments(wsp_ggml_backend_sched_t sched, struct wsp_g
792
950
  }
793
951
  wsp_ggml_tallocr_t src_allocr = node_allocr(src);
794
952
  wsp_ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
795
- fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
953
+ fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
796
954
  fmt_size(wsp_ggml_nbytes(src)), src_backend ? wsp_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
797
955
  }
798
956
  fprintf(stderr, "\n");
@@ -808,15 +966,17 @@ static struct wsp_ggml_tensor * wsp_ggml_dup_tensor_layout(struct wsp_ggml_conte
808
966
  return dup;
809
967
  }
810
968
 
969
+
970
+ //#define DEBUG_PASS1
971
+ //#define DEBUG_PASS2
972
+ //#define DEBUG_PASS3
973
+ //#define DEBUG_PASS4
974
+
811
975
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
812
- // TODO: merge passes
813
976
  static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
814
- // reset state
815
- size_t hash_size = sched->hash_set.size;
816
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
817
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
818
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
977
+ // reset splits
819
978
  sched->n_splits = 0;
979
+ sched->is_reset = false;
820
980
 
821
981
  struct wsp_ggml_init_params params = {
822
982
  /* .mem_size = */ sizeof(sched->context_buffer),
@@ -824,26 +984,22 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
824
984
  /* .no_alloc = */ true
825
985
  };
826
986
 
827
- if (sched->ctx != NULL) {
828
- wsp_ggml_free(sched->ctx);
829
- }
987
+ wsp_ggml_free(sched->ctx);
830
988
 
831
989
  sched->ctx = wsp_ggml_init(params);
990
+ if (sched->ctx == NULL) {
991
+ fprintf(stderr, "%s: failed to initialize context\n", __func__);
992
+ WSP_GGML_ASSERT(false);
993
+ }
832
994
 
833
- // pass 1: assign backends to ops with allocated inputs
995
+ // pass 1: assign backends to ops with pre-allocated inputs
834
996
  for (int i = 0; i < graph->n_leafs; i++) {
835
997
  struct wsp_ggml_tensor * leaf = graph->leafs[i];
836
998
  if (node_allocr(leaf) != NULL) {
837
999
  // do not overwrite user assignments
838
1000
  continue;
839
1001
  }
840
- wsp_ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
841
- if (leaf_backend == NULL && leaf->view_src != NULL) {
842
- leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
843
- }
844
- if (leaf_backend != NULL) {
845
- node_allocr(leaf) = wsp_ggml_backend_sched_get_tallocr(sched, leaf_backend);
846
- }
1002
+ node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
847
1003
  }
848
1004
 
849
1005
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -852,50 +1008,120 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
852
1008
  // do not overwrite user assignments
853
1009
  continue;
854
1010
  }
855
- wsp_ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
856
- if (node_backend != NULL) {
857
- node_allocr(node) = wsp_ggml_backend_sched_get_tallocr(sched, node_backend);
1011
+ node_allocr(node) = sched_allocr_from_cur(sched, node);
1012
+ // src
1013
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
1014
+ struct wsp_ggml_tensor * src = node->src[j];
1015
+ if (src == NULL) {
1016
+ break;
1017
+ }
1018
+ if (node_allocr(src) == NULL) {
1019
+ node_allocr(src) = sched_allocr_from_cur(sched, src);
1020
+ }
858
1021
  }
859
1022
  }
860
- //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1023
+ #ifdef DEBUG_PASS1
1024
+ fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1025
+ #endif
861
1026
 
862
- // pass 2: assign backends to ops from current assignments
863
- // TODO:
864
- // - reuse sched_backend_from_cur
865
- for (int i = 0; i < graph->n_nodes; i++) {
866
- struct wsp_ggml_tensor * node = graph->nodes[i];
867
- wsp_ggml_tallocr_t node_allocr = node_allocr(node);
868
- if (node_allocr == NULL) {
869
- int cur_prio = INT_MAX;
870
- size_t cur_size = 0;
871
- for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
872
- struct wsp_ggml_tensor * src = node->src[j];
873
- if (src == NULL) {
874
- break;
1027
+ // pass 2: expand current backend assignments
1028
+ // assign the same backend to adjacent nodes
1029
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1030
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1031
+
1032
+ // pass 2.1 expand gpu up
1033
+ {
1034
+ wsp_ggml_tallocr_t cur_allocr = NULL;
1035
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1036
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1037
+ if (wsp_ggml_is_view_op(node->op)) {
1038
+ continue;
1039
+ }
1040
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
1041
+ if (node_allocr != NULL) {
1042
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1043
+ // skip cpu (lowest prio backend)
1044
+ cur_allocr = NULL;
1045
+ } else {
1046
+ cur_allocr = node_allocr;
875
1047
  }
876
- wsp_ggml_tallocr_t src_allocr = node_allocr(src);
877
- if (src_allocr != NULL) {
878
- int src_prio = sched_allocr_prio(sched, src_allocr);
879
- size_t src_size = wsp_ggml_nbytes(src);
880
- if (src_prio < cur_prio && src_size >= cur_size) {
881
- cur_prio = src_prio;
882
- cur_size = src_size;
883
- node_allocr = src_allocr;
884
- SET_CAUSE(node, "2.src%d", j);
885
- }
1048
+ } else {
1049
+ node_allocr(node) = cur_allocr;
1050
+ SET_CAUSE(node, "2.1");
1051
+ }
1052
+ }
1053
+ }
1054
+
1055
+ // pass 2.2 expand gpu down
1056
+ {
1057
+ wsp_ggml_tallocr_t cur_allocr = NULL;
1058
+ for (int i = 0; i < graph->n_nodes; i++) {
1059
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1060
+ if (wsp_ggml_is_view_op(node->op)) {
1061
+ continue;
1062
+ }
1063
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
1064
+ if (node_allocr != NULL) {
1065
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1066
+ // skip cpu (lowest prio backend)
1067
+ cur_allocr = NULL;
1068
+ } else {
1069
+ cur_allocr = node_allocr;
886
1070
  }
1071
+ } else {
1072
+ node_allocr(node) = cur_allocr;
1073
+ SET_CAUSE(node, "2.2");
1074
+ }
1075
+ }
1076
+ }
1077
+
1078
+ // pass 2.3 expand rest up
1079
+ {
1080
+ wsp_ggml_tallocr_t cur_allocr = NULL;
1081
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1082
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1083
+ if (wsp_ggml_is_view_op(node->op)) {
1084
+ continue;
1085
+ }
1086
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
1087
+ if (node_allocr != NULL) {
1088
+ cur_allocr = node_allocr;
1089
+ } else {
1090
+ node_allocr(node) = cur_allocr;
1091
+ SET_CAUSE(node, "2.3");
887
1092
  }
1093
+ }
1094
+ }
1095
+
1096
+ // pass 2.4 expand rest down
1097
+ {
1098
+ wsp_ggml_tallocr_t cur_allocr = NULL;
1099
+ for (int i = 0; i < graph->n_nodes; i++) {
1100
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1101
+ if (wsp_ggml_is_view_op(node->op)) {
1102
+ continue;
1103
+ }
1104
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
888
1105
  if (node_allocr != NULL) {
889
- node_allocr(node) = node_allocr;
1106
+ cur_allocr = node_allocr;
1107
+ } else {
1108
+ node_allocr(node) = cur_allocr;
1109
+ SET_CAUSE(node, "2.4");
890
1110
  }
891
1111
  }
892
1112
  }
893
- //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1113
+ #ifdef DEBUG_PASS2
1114
+ fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1115
+ #endif
894
1116
 
895
- // pass 3: assign backends to remaining src from dst (should only be leafs)
1117
+ // pass 3: assign backends to remaining src from dst and view_src
896
1118
  for (int i = 0; i < graph->n_nodes; i++) {
897
1119
  struct wsp_ggml_tensor * node = graph->nodes[i];
898
- wsp_ggml_tallocr_t node_allocr = node_allocr(node);
1120
+ wsp_ggml_tallocr_t cur_allocr = node_allocr(node);
1121
+ if (node->view_src != NULL && cur_allocr == NULL) {
1122
+ cur_allocr = node_allocr(node) = node_allocr(node->view_src);
1123
+ SET_CAUSE(node, "3.vsrc");
1124
+ }
899
1125
  for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
900
1126
  struct wsp_ggml_tensor * src = node->src[j];
901
1127
  if (src == NULL) {
@@ -903,81 +1129,107 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
903
1129
  }
904
1130
  wsp_ggml_tallocr_t src_allocr = node_allocr(src);
905
1131
  if (src_allocr == NULL) {
906
- node_allocr(src) = node_allocr;
1132
+ if (src->view_src != NULL) {
1133
+ // views are always on the same backend as the source
1134
+ node_allocr(src) = node_allocr(src->view_src);
1135
+ SET_CAUSE(src, "3.vsrc");
1136
+ } else {
1137
+ node_allocr(src) = cur_allocr;
1138
+ SET_CAUSE(src, "3.cur");
1139
+ }
907
1140
  }
908
1141
  }
909
1142
  }
910
- //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1143
+ #ifdef DEBUG_PASS3
1144
+ fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1145
+ #endif
911
1146
 
912
1147
  // pass 4: split graph, find tensors that need to be copied
913
- // TODO:
914
- // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
915
- // find first backend
916
- int cur_split = 0;
917
- for (int i = 0; i < graph->n_nodes; i++) {
918
- struct wsp_ggml_tensor * node = graph->nodes[i];
919
- if (node->view_src == NULL) {
920
- sched->splits[0].tallocr = node_allocr(node);
921
- break;
922
- }
923
- }
924
- sched->splits[0].i_start = 0;
925
- sched->splits[0].n_inputs = 0;
926
- memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
927
- wsp_ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
928
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
929
- for (int i = 0; i < graph->n_nodes; i++) {
930
- struct wsp_ggml_tensor * node = graph->nodes[i];
931
-
932
- if (wsp_ggml_is_view_op(node->op)) {
933
- continue;
1148
+ {
1149
+ int cur_split = 0;
1150
+ // find the backend of the first split, skipping view ops
1151
+ for (int i = 0; i < graph->n_nodes; i++) {
1152
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1153
+ if (!wsp_ggml_is_view_op(node->op)) {
1154
+ sched->splits[0].tallocr = node_allocr(node);
1155
+ break;
1156
+ }
934
1157
  }
1158
+ sched->splits[0].i_start = 0;
1159
+ sched->splits[0].n_inputs = 0;
1160
+ memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1161
+ wsp_ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1162
+ size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1163
+ for (int i = 0; i < graph->n_nodes; i++) {
1164
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1165
+
1166
+ if (wsp_ggml_is_view_op(node->op)) {
1167
+ continue;
1168
+ }
935
1169
 
936
- wsp_ggml_tallocr_t node_allocr = node_allocr(node);
1170
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
937
1171
 
938
- if (node_allocr != cur_allocr) {
939
- sched->splits[cur_split].i_end = i;
940
- cur_split++;
941
- WSP_GGML_ASSERT(cur_split < WSP_GGML_MAX_SPLITS);
942
- sched->splits[cur_split].tallocr = node_allocr;
943
- sched->splits[cur_split].i_start = i;
944
- sched->splits[cur_split].n_inputs = 0;
945
- memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
946
- cur_allocr = node_allocr;
947
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
948
- }
1172
+ WSP_GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
949
1173
 
950
- // find inputs that are not on the same backend
951
- for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
952
- struct wsp_ggml_tensor * src = node->src[j];
953
- if (src == NULL) {
954
- break;
1174
+ if (node_allocr != cur_allocr) {
1175
+ sched->splits[cur_split].i_end = i;
1176
+ cur_split++;
1177
+ WSP_GGML_ASSERT(cur_split < WSP_GGML_MAX_SPLITS);
1178
+ sched->splits[cur_split].tallocr = node_allocr;
1179
+ sched->splits[cur_split].i_start = i;
1180
+ sched->splits[cur_split].n_inputs = 0;
1181
+ cur_allocr = node_allocr;
1182
+ cur_backend_id = sched_allocr_prio(sched, cur_allocr);
955
1183
  }
956
- wsp_ggml_tallocr_t src_allocr = node_allocr(src);
957
- if (src_allocr != node_allocr) {
958
- int n_inputs = sched->splits[cur_split].n_inputs++;
959
- WSP_GGML_ASSERT(n_inputs < WSP_GGML_MAX_SPLIT_INPUTS);
960
- sched->splits[cur_split].inputs[n_inputs] = (struct wsp_ggml_tensor *)src;
961
-
962
- // create copies
963
- size_t id = hash_id(src);
964
- if (sched->node_copies[id][cur_backend_id] == NULL) {
965
- struct wsp_ggml_tensor * tensor_copy = wsp_ggml_dup_tensor_layout(sched->ctx, src);
966
- sched->node_copies[id][cur_backend_id] = tensor_copy;
967
- node_allocr(tensor_copy) = cur_allocr;
968
- wsp_ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
969
- wsp_ggml_format_name(tensor_copy, "%s#%s", wsp_ggml_backend_name(backend), src->name);
1184
+
1185
+ // find inputs that are not on the same backend
1186
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
1187
+ struct wsp_ggml_tensor * src = node->src[j];
1188
+ if (src == NULL) {
1189
+ break;
1190
+ }
1191
+ wsp_ggml_tallocr_t src_allocr = node_allocr(src);
1192
+ WSP_GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
1193
+ if (src_allocr != node_allocr) {
1194
+ // check if the input is already in the split
1195
+ bool found = false;
1196
+ for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
1197
+ if (sched->splits[cur_split].inputs[k] == src) {
1198
+ found = true;
1199
+ break;
1200
+ }
1201
+ }
1202
+
1203
+ if (!found) {
1204
+ int n_inputs = sched->splits[cur_split].n_inputs++;
1205
+ //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, wsp_ggml_backend_name(get_allocr_backend(sched, src_allocr)));
1206
+ WSP_GGML_ASSERT(n_inputs < WSP_GGML_MAX_SPLIT_INPUTS);
1207
+ sched->splits[cur_split].inputs[n_inputs] = src;
1208
+ }
1209
+
1210
+ // create a copy of the input in the split's backend
1211
+ size_t id = hash_id(src);
1212
+ if (sched->node_copies[id][cur_backend_id] == NULL) {
1213
+ wsp_ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1214
+ struct wsp_ggml_tensor * tensor_copy = wsp_ggml_dup_tensor_layout(sched->ctx, src);
1215
+ wsp_ggml_format_name(tensor_copy, "%s#%s", wsp_ggml_backend_name(backend), src->name);
1216
+
1217
+ sched->node_copies[id][cur_backend_id] = tensor_copy;
1218
+ node_allocr(tensor_copy) = cur_allocr;
1219
+ SET_CAUSE(tensor_copy, "4.cpy");
1220
+ }
1221
+ node->src[j] = sched->node_copies[id][cur_backend_id];
970
1222
  }
971
- node->src[j] = sched->node_copies[id][cur_backend_id];
972
1223
  }
973
1224
  }
1225
+ sched->splits[cur_split].i_end = graph->n_nodes;
1226
+ sched->n_splits = cur_split + 1;
974
1227
  }
975
- sched->splits[cur_split].i_end = graph->n_nodes;
976
- sched->n_splits = cur_split + 1;
977
-
978
- //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
1228
+ #ifdef DEBUG_PASS4
1229
+ fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1230
+ #endif
979
1231
 
980
- #if 1
1232
+ #ifndef NDEBUG
981
1233
  // sanity check: all sources should have the same backend as the node
982
1234
  for (int i = 0; i < graph->n_nodes; i++) {
983
1235
  struct wsp_ggml_tensor * node = graph->nodes[i];
@@ -985,6 +1237,11 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
  if (node_allocr == NULL) {
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
  }
+ if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+ fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
+ node->name, node_allocr ? wsp_ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+ node->view_src->name, node_allocr(node->view_src) ? wsp_ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+ }
  for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
  struct wsp_ggml_tensor * src = node->src[j];
  if (src == NULL) {
@@ -996,8 +1253,14 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
  node->name, node_allocr ? wsp_ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
  j, src->name, src_allocr ? wsp_ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
  }
+ if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+ fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
+ src->name, src_allocr ? wsp_ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
+ src->view_src->name, node_allocr(src->view_src) ? wsp_ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+ }
  }
  }
+ fflush(stderr);
  #endif

  // create copies of the graph for each split
@@ -1011,6 +1274,8 @@ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cg
  for (int j = 0; j < split->n_inputs; j++) {
  struct wsp_ggml_tensor * input = split->inputs[j];
  struct wsp_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+ // add a dependency to the input source so that it is not freed before the copy is done
+ WSP_GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
  input_cpy->src[0] = input;
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
  }
@@ -1045,24 +1310,16 @@ static void sched_compute_splits(wsp_ggml_backend_sched_t sched) {
  uint64_t copy_start_us = wsp_ggml_time_us();
  for (int j = 0; j < split->n_inputs; j++) {
  struct wsp_ggml_tensor * input = split->inputs[j];
- struct wsp_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
- if (input->buffer == NULL) {
- if (input->view_src == NULL) {
- fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
- exit(1);
- }
- // FIXME: may need to use the sched buffer instead
- wsp_ggml_backend_view_init(input->view_src->buffer, input);
- }
- if (input_cpy->buffer == NULL) {
- fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
- exit(1);
- }
- //WSP_GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
- //WSP_GGML_ASSERT(input_cpy->buffer->backend == split_backend);
- wsp_ggml_backend_tensor_copy(input, input_cpy);
+ struct wsp_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
+
+ WSP_GGML_ASSERT(input->buffer != NULL);
+ WSP_GGML_ASSERT(input_cpy->buffer != NULL);
+
+ // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
+ // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
+ wsp_ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
  }
- // wsp_ggml_backend_synchronize(split_backend);
+ //wsp_ggml_backend_synchronize(split_backend); // necessary to measure copy time
  int64_t copy_end_us = wsp_ggml_time_us();
  copy_us[split_backend_id] += copy_end_us - copy_start_us;

@@ -1072,9 +1329,38 @@ static void sched_compute_splits(wsp_ggml_backend_sched_t sched) {
  wsp_ggml_graph_dump_dot(split->graph, NULL, split_filename);
  #endif

+
  uint64_t compute_start_us = wsp_ggml_time_us();
- wsp_ggml_backend_graph_compute(split_backend, &split->graph);
- // wsp_ggml_backend_synchronize(split_backend);
+ if (!sched->callback_eval) {
+ wsp_ggml_backend_graph_compute(split_backend, &split->graph);
+ //wsp_ggml_backend_synchronize(split_backend); // necessary to measure compute time
+ } else {
+ // similar to wsp_ggml_backend_compare_graph_backend
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+ struct wsp_ggml_tensor * t = split->graph.nodes[j0];
+
+ // check if the user needs data from this node
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+ int j1 = j0;
+
+ // determine the range [j0, j1] of nodes that can be computed together
+ while (!need && j1 < split->graph.n_nodes - 1) {
+ t = split->graph.nodes[++j1];
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+ }
+
+ struct wsp_ggml_cgraph gv = wsp_ggml_graph_view(&split->graph, j0, j1 + 1);
+
+ wsp_ggml_backend_graph_compute(split_backend, &gv);
+
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+ break;
+ }
+
+ j0 = j1;
+ }
+ }
  uint64_t compute_end_us = wsp_ggml_time_us();
  compute_us[split_backend_id] += compute_end_us - compute_start_us;
  }
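The else-branch above introduces an optional per-node eval callback in sched_compute_splits: for every node the scheduler first asks the callback whether its data is needed (second argument true), batches the consecutive "not needed" nodes into a single wsp_ggml_graph_view range, computes that range, and then calls the callback again with the second argument false so the computed tensor can be inspected; returning false at that point stops the rest of the split. A minimal sketch of a callback matching this calling convention follows; the helper name, the "logits" name filter, and the include path for this package's prefixed ggml headers are illustrative assumptions, not part of the diff.

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>
    #include "ggml-backend.h" // assumed header name for the wsp_-prefixed ggml shipped with this package

    // Sketch of a scheduler eval callback: (tensor, ask, user_data) -> bool, as used above.
    static bool my_eval_cb(struct wsp_ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            // "ask" phase: only request results for tensors whose name starts with "logits" (illustrative filter)
            return strncmp(t->name, "logits", 6) == 0;
        }
        // "result" phase: t has been computed; returning false would abort the rest of the split
        fprintf(stderr, "computed %s\n", t->name);
        return true;
    }

The callback is registered with wsp_ggml_backend_sched_set_eval_callback (added further below); when no callback is set, the split is still computed in a single wsp_ggml_backend_graph_compute call as before.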
@@ -1094,26 +1380,41 @@ static void sched_reset(wsp_ggml_backend_sched_t sched) {
  for (int i = 0; i < sched->n_backends; i++) {
  wsp_ggml_tallocr_reset(sched->tallocs[i]);
  }
+ // reset state for the next run
+ size_t hash_size = sched->hash_set.size;
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
+ memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
+ memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
+
+ sched->is_reset = true;
  }

- wsp_ggml_backend_sched_t wsp_ggml_backend_sched_new(wsp_ggml_backend_t * backends, int n_backends) {
+ wsp_ggml_backend_sched_t wsp_ggml_backend_sched_new(wsp_ggml_backend_t * backends, wsp_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
+ WSP_GGML_ASSERT(n_backends > 0);
  WSP_GGML_ASSERT(n_backends <= WSP_GGML_MAX_BACKENDS);

- struct wsp_ggml_backend_sched * sched = malloc(sizeof(struct wsp_ggml_backend_sched));
- memset(sched, 0, sizeof(struct wsp_ggml_backend_sched));
+ struct wsp_ggml_backend_sched * sched = calloc(sizeof(struct wsp_ggml_backend_sched), 1);
+
+ // initialize hash table
+ sched->hash_set = wsp_ggml_hash_set_new(graph_size + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS);
+ sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
+ sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);

  sched->n_backends = n_backends;
  for (int i = 0; i < n_backends; i++) {
  sched->backends[i] = backends[i];
+ sched->bufts[i] = bufts ? bufts[i] : wsp_ggml_backend_get_default_buffer_type(backends[i]);
  }

  sched->galloc = wsp_ggml_gallocr_new();

  // init measure allocs for each backend
  for (int i = 0; i < n_backends; i++) {
- sched->tallocs[i] = wsp_ggml_tallocr_new_measure_from_backend(backends[i]);
+ sched->tallocs[i] = wsp_ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
  }

+ sched_reset(sched);
+
  return sched;
  }
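wsp_ggml_backend_sched_new now takes an optional array of buffer types and an expected graph size, sizes the hash tables up front, and resets the scheduler before returning. A minimal construction sketch under the new signature; the backends array is assumed to be initialized by the caller, the helper name is illustrative, and passing NULL for bufts falls back to each backend's default buffer type, as the loop above shows.

    #include "ggml-backend.h" // assumed header name, as in the earlier sketch

    // Sketch: create a scheduler sized for graphs of up to `graph_size` nodes.
    static wsp_ggml_backend_sched_t make_sched(wsp_ggml_backend_t * backends, int n_backends, size_t graph_size) {
        // NULL bufts -> wsp_ggml_backend_get_default_buffer_type() is used for each backend
        return wsp_ggml_backend_sched_new(backends, /*bufts=*/NULL, n_backends, graph_size);
    }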
@@ -1125,6 +1426,7 @@ void wsp_ggml_backend_sched_free(wsp_ggml_backend_sched_t sched) {
  wsp_ggml_tallocr_free(sched->tallocs[i]);
  }
  wsp_ggml_gallocr_free(sched->galloc);
+ wsp_ggml_free(sched->ctx);
  free(sched->hash_set.keys);
  free(sched->node_talloc);
  free(sched->node_copies);
@@ -1132,12 +1434,7 @@ void wsp_ggml_backend_sched_free(wsp_ggml_backend_sched_t sched) {
  }

  void wsp_ggml_backend_sched_init_measure(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph) {
- // initialize hash tables
- size_t hash_size = measure_graph->visited_hash_table.size + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS;
- sched->hash_set.size = hash_size;
- sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
- sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
- sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
+ WSP_GGML_ASSERT(wsp_ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once

  sched_split_graph(sched, measure_graph);
  sched_alloc_splits(sched);
@@ -1146,28 +1443,47 @@ void wsp_ggml_backend_sched_init_measure(wsp_ggml_backend_sched_t sched, struct
  for (int i = 0; i < sched->n_backends; i++) {
  size_t size = wsp_ggml_tallocr_max_size(sched->tallocs[i]);
  wsp_ggml_tallocr_free(sched->tallocs[i]);
- sched->tallocs[i] = wsp_ggml_tallocr_new_from_backend(sched->backends[i], size);
+ sched->tallocs[i] = wsp_ggml_tallocr_new_from_buft(sched->bufts[i], size);
  }

  sched_reset(sched);
  }

  void wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
- WSP_GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS);
+ WSP_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS);
+
+ if (!sched->is_reset) {
+ sched_reset(sched);
+ }

  sched_split_graph(sched, graph);
  sched_alloc_splits(sched);
  sched_compute_splits(sched);
+ }
+
+ void wsp_ggml_backend_sched_reset(wsp_ggml_backend_sched_t sched) {
  sched_reset(sched);
  }

+
+ void wsp_ggml_backend_sched_set_eval_callback(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_sched_eval_callback callback, void * user_data) {
+ sched->callback_eval = callback;
+ sched->callback_eval_user_data = user_data;
+ }
+
+ int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched) {
+ return sched->n_splits;
+ }
+
  wsp_ggml_tallocr_t wsp_ggml_backend_sched_get_tallocr(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
  int backend_index = sched_backend_prio(sched, backend);
+ WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
  return sched->tallocs[backend_index];
  }

  wsp_ggml_backend_buffer_t wsp_ggml_backend_sched_get_buffer(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
  int backend_index = sched_backend_prio(sched, backend);
+ WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
  return wsp_ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
  }
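With the new wsp_ggml_backend_sched_reset, wsp_ggml_backend_sched_set_eval_callback and wsp_ggml_backend_sched_get_n_splits entry points, a typical per-graph flow looks roughly like the sketch below. The helper name is illustrative, the one-time measurement pass via wsp_ggml_backend_sched_init_measure is assumed to have happened beforehand, and the callback parameter is of the wsp_ggml_backend_sched_eval_callback type used above.

    #include <stdio.h>
    #include "ggml-backend.h" // assumed header name, as in the earlier sketches

    // Sketch: run one graph through the scheduler and report how many splits it produced.
    static void run_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph,
                          wsp_ggml_backend_sched_eval_callback cb, void * cb_user_data) {
        // optional: observe intermediate tensors while the splits are computed
        wsp_ggml_backend_sched_set_eval_callback(sched, cb, cb_user_data);

        wsp_ggml_backend_sched_graph_compute(sched, graph);
        fprintf(stderr, "graph executed in %d split(s)\n", wsp_ggml_backend_sched_get_n_splits(sched));

        // graph_compute no longer resets at the end; clear the per-graph state explicitly here,
        // otherwise the next call resets lazily through the is_reset flag
        wsp_ggml_backend_sched_reset(sched);
    }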
@@ -1177,10 +1493,19 @@ void wsp_ggml_backend_sched_set_node_backend(wsp_ggml_backend_sched_t sched, str
  node_allocr(node) = sched->tallocs[backend_index];
  }

+ wsp_ggml_backend_t wsp_ggml_backend_sched_get_node_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node) {
+ wsp_ggml_tallocr_t allocr = node_allocr(node);
+ if (allocr == NULL) {
+ return NULL;
+ }
+ return get_allocr_backend(sched, allocr);
+ }
+
  // utils
+
  void wsp_ggml_backend_view_init(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
  WSP_GGML_ASSERT(tensor->buffer == NULL);
- WSP_GGML_ASSERT(tensor->data == NULL);
+ //WSP_GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in wsp_ggml_new_tensor, but still need to be initialized by the backend
  WSP_GGML_ASSERT(tensor->view_src != NULL);
  WSP_GGML_ASSERT(tensor->view_src->buffer != NULL);
  WSP_GGML_ASSERT(tensor->view_src->data != NULL);
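The new wsp_ggml_backend_sched_get_node_backend accessor exposes which backend a node was assigned to, returning NULL when it has no assignment. It is useful for checking split decisions, or a pin made with wsp_ggml_backend_sched_set_node_backend, while the assignments are still valid, i.e. before the scheduler is reset. A small sketch with an illustrative helper name:

    #include <stdio.h>
    #include "ggml-backend.h" // assumed header name, as in the earlier sketches

    // Sketch: report the backend the scheduler picked for a given node.
    static void print_node_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node) {
        wsp_ggml_backend_t backend = wsp_ggml_backend_sched_get_node_backend(sched, node);
        fprintf(stderr, "%s -> %s\n", node->name, backend ? wsp_ggml_backend_name(backend) : "(unassigned)");
    }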
@@ -1246,6 +1571,7 @@ static void graph_init_tensor(struct wsp_ggml_hash_set hash_set, struct wsp_ggml

  struct wsp_ggml_tensor * dst = node_copies[id];
  if (dst->view_src != NULL) {
+ graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
  wsp_ggml_backend_view_init(dst->view_src->buffer, dst);
  }
  else {
@@ -1279,6 +1605,21 @@ struct wsp_ggml_backend_graph_copy wsp_ggml_backend_graph_copy(wsp_ggml_backend_
  struct wsp_ggml_context * ctx_allocated = wsp_ggml_init(params);
  struct wsp_ggml_context * ctx_unallocated = wsp_ggml_init(params);

+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+ fprintf(stderr, "failed to allocate context for graph copy\n");
+ free(hash_set.keys);
+ free(node_copies);
+ free(node_init);
+ wsp_ggml_free(ctx_allocated);
+ wsp_ggml_free(ctx_unallocated);
+ return (struct wsp_ggml_backend_graph_copy) {
+ /* .buffer = */ NULL,
+ /* .ctx_allocated = */ NULL,
+ /* .ctx_unallocated = */ NULL,
+ /* .graph = */ NULL,
+ };
+ }
+
  // dup nodes
  for (int i = 0; i < graph->n_nodes; i++) {
  struct wsp_ggml_tensor * node = graph->nodes[i];
@@ -1287,6 +1628,20 @@ struct wsp_ggml_backend_graph_copy wsp_ggml_backend_graph_copy(wsp_ggml_backend_

  // allocate nodes
  wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+ if (buffer == NULL) {
+ fprintf(stderr, "failed to allocate buffer for graph copy\n");
+ free(hash_set.keys);
+ free(node_copies);
+ free(node_init);
+ wsp_ggml_free(ctx_allocated);
+ wsp_ggml_free(ctx_unallocated);
+ return (struct wsp_ggml_backend_graph_copy) {
+ /* .buffer = */ NULL,
+ /* .ctx_allocated = */ NULL,
+ /* .ctx_unallocated = */ NULL,
+ /* .graph = */ NULL,
+ };
+ }

  //printf("copy buffer size: %zu MB\n", wsp_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
@@ -1323,8 +1678,12 @@ void wsp_ggml_backend_graph_copy_free(struct wsp_ggml_backend_graph_copy copy) {
  wsp_ggml_free(copy.ctx_unallocated);
  }

- void wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data) {
+ bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data) {
  struct wsp_ggml_backend_graph_copy copy = wsp_ggml_backend_graph_copy(backend2, graph);
+ if (copy.buffer == NULL) {
+ return false;
+ }
+
  struct wsp_ggml_cgraph * g1 = graph;
  struct wsp_ggml_cgraph * g2 = copy.graph;

@@ -1354,4 +1713,6 @@ void wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggm
  }

  wsp_ggml_backend_graph_copy_free(copy);
+
+ return true;
  }
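Since wsp_ggml_backend_compare_graph_backend now returns a bool, false means the comparison was skipped because the internal graph copy could not be allocated, not that the backends disagreed. A thin wrapper sketch; the wrapper name is illustrative, and the callback and user_data are assumed to be prepared by the caller with whatever signature wsp_ggml_backend_eval_callback has in the package headers:

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml-backend.h" // assumed header name, as in the earlier sketches

    // Sketch: surface allocation failures from the two-backend comparison to the caller.
    static bool compare_or_warn(wsp_ggml_backend_t b1, wsp_ggml_backend_t b2, struct wsp_ggml_cgraph * graph,
                                wsp_ggml_backend_eval_callback callback, void * user_data) {
        if (!wsp_ggml_backend_compare_graph_backend(b1, b2, graph, callback, user_data)) {
            fprintf(stderr, "backend comparison skipped: graph copy allocation failed\n");
            return false;
        }
        return true;
    }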