cui-llama.rn 1.2.3 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@
8
8
  #include <windows.h>
9
9
  #endif
10
10
 
11
+ #include "ggml-backend.h"
11
12
  #include "ggml-backend-impl.h"
12
13
  #include "ggml-alloc.h"
13
14
  #include "ggml-impl.h"
@@ -34,6 +35,11 @@ const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
34
35
  }
35
36
 
36
37
  lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
38
+ if (size == 0) {
39
+ // return a dummy buffer for zero-sized allocations
40
+ return lm_ggml_backend_buffer_init(buft, {}, NULL, 0);
41
+ }
42
+
37
43
  return buft->iface.alloc_buffer(buft, size);
38
44
  }
39
45
 
@@ -89,7 +95,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
89
95
  }
90
96
 
91
97
  const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) {
92
- return buffer->iface.get_name(buffer);
98
+ return lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(buffer));
93
99
  }
94
100
 
95
101
  void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
@@ -108,6 +114,11 @@ size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
108
114
  }
109
115
 
110
116
  void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
117
+ // get_base is optional if the buffer is zero-sized
118
+ if (buffer->size == 0) {
119
+ return NULL;
120
+ }
121
+
111
122
  void * base = buffer->iface.get_base(buffer);
112
123
 
113
124
  LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -122,6 +133,15 @@ void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct
122
133
  }
123
134
  }
124
135
 
136
+ void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
137
+ // clear is optional if the buffer is zero-sized
138
+ if (buffer->size == 0) {
139
+ return;
140
+ }
141
+
142
+ buffer->iface.clear(buffer, value);
143
+ }
144
+
125
145
  size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) {
126
146
  return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
127
147
  }
@@ -134,10 +154,6 @@ size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, st
134
154
  return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor);
135
155
  }
136
156
 
137
- void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
138
- buffer->iface.clear(buffer, value);
139
- }
140
-
141
157
  bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) {
142
158
  return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer));
143
159
  }
@@ -198,7 +214,7 @@ void lm_ggml_backend_free(lm_ggml_backend_t backend) {
198
214
  }
199
215
 
200
216
  lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) {
201
- return backend->iface.get_default_buffer_type(backend);
217
+ return lm_ggml_backend_dev_buffer_type(backend->device);
202
218
  }
203
219
 
204
220
  lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) {
@@ -238,43 +254,42 @@ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm
238
254
  void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
239
255
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
240
256
 
257
+ if (size == 0) {
258
+ return;
259
+ }
260
+
241
261
  LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
242
262
  LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
243
263
  LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
244
264
 
245
- if (!size) {
246
- return;
247
- }
248
-
249
265
  buf->iface.set_tensor(buf, tensor, data, offset, size);
250
266
  }
251
267
 
252
268
  void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
253
269
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
254
270
 
271
+ if (size == 0) {
272
+ return;
273
+ }
274
+
255
275
  LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
256
276
  LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
257
277
  LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
258
278
 
259
- if (!size) {
260
- return;
261
- }
262
-
263
279
  buf->iface.get_tensor(buf, tensor, data, offset, size);
264
280
  }
265
281
 
266
282
  LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
267
283
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
268
284
 
269
- LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
270
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
271
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
272
-
273
- if (!size) {
285
+ if (size == 0) {
274
286
  return;
275
287
  }
276
288
 
277
- LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
289
+ LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
290
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
291
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
292
+ LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
278
293
 
279
294
  buf->iface.memset_tensor(buf, tensor, value, offset, size);
280
295
  }
@@ -316,33 +331,15 @@ enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backen
316
331
  }
317
332
 
318
333
  bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
319
- // helper to ease transition to device interface
320
- if (backend->device) {
321
- return lm_ggml_backend_dev_supports_op(backend->device, op);
322
- }
323
-
324
- return backend->iface.supports_op(backend, op);
334
+ return lm_ggml_backend_dev_supports_op(backend->device, op);
325
335
  }
326
336
 
327
337
  bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
328
- // helper to ease transition to device interface
329
- if (backend->device) {
330
- return lm_ggml_backend_dev_supports_buft(backend->device, buft);
331
- }
332
-
333
- return backend->iface.supports_buft(backend, buft);
338
+ return lm_ggml_backend_dev_supports_buft(backend->device, buft);
334
339
  }
335
340
 
336
341
  bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
337
- // helper to ease transition to device interface
338
- if (backend->device) {
339
- return lm_ggml_backend_dev_offload_op(backend->device, op);
340
- }
341
-
342
- if (backend->iface.offload_op != NULL) {
343
- return backend->iface.offload_op(backend, op);
344
- }
345
- return false;
342
+ return lm_ggml_backend_dev_offload_op(backend->device, op);
346
343
  }
347
344
 
348
345
  lm_ggml_backend_dev_t lm_ggml_backend_get_device(lm_ggml_backend_t backend) {
@@ -379,7 +376,7 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten
379
376
  lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src));
380
377
  } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) {
381
378
  #ifndef NDEBUG
382
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
379
+ LM_GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
383
380
  #endif
384
381
  size_t nbytes = lm_ggml_nbytes(src);
385
382
  void * data = malloc(nbytes);
@@ -538,10 +535,40 @@ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const cha
538
535
  #include "ggml-metal.h"
539
536
  #endif
540
537
 
538
+ #ifdef LM_GGML_USE_SYCL
539
+ #include "ggml-sycl.h"
540
+ #endif
541
+
542
+ #ifdef LM_GGML_USE_VULKAN
543
+ #include "ggml-vulkan.h"
544
+ #endif
545
+
541
546
  #ifdef LM_GGML_USE_BLAS
542
547
  #include "ggml-blas.h"
543
548
  #endif
544
549
 
550
+ #ifdef LM_GGML_USE_RPC
551
+ #include "ggml-rpc.h"
552
+ #endif
553
+
554
+ #ifndef __AMX_INT8__
555
+ #undef LM_GGML_USE_AMX
556
+ #endif
557
+
558
+ #ifdef LM_GGML_USE_AMX
559
+ # include "ggml-amx.h"
560
+ #endif
561
+
562
+ #ifdef LM_GGML_USE_CANN
563
+ #include "ggml-cann.h"
564
+ #endif
565
+
566
+ #ifdef LM_GGML_USE_KOMPUTE
567
+ #include "ggml-kompute.h"
568
+ #endif
569
+
570
+ #include "ggml-cpu.h"
571
+
545
572
  struct lm_ggml_backend_registry {
546
573
  std::vector<lm_ggml_backend_reg_t> backends;
547
574
  std::vector<lm_ggml_backend_dev_t> devices;
@@ -553,18 +580,34 @@ struct lm_ggml_backend_registry {
553
580
  #ifdef LM_GGML_USE_METAL
554
581
  register_backend(lm_ggml_backend_metal_reg());
555
582
  #endif
583
+ #ifdef LM_GGML_USE_SYCL
584
+ register_backend(lm_ggml_backend_sycl_reg());
585
+ #endif
586
+ #ifdef LM_GGML_USE_VULKAN
587
+ register_backend(lm_ggml_backend_vk_reg());
588
+ #endif
589
+ #ifdef LM_GGML_USE_CANN
590
+ register_backend(lm_ggml_backend_cann_reg());
591
+ #endif
556
592
  #ifdef LM_GGML_USE_BLAS
557
593
  register_backend(lm_ggml_backend_blas_reg());
558
594
  #endif
559
-
560
- // TODO: sycl, vulkan, kompute, cann
595
+ #ifdef LM_GGML_USE_RPC
596
+ register_backend(lm_ggml_backend_rpc_reg());
597
+ #endif
598
+ #ifdef LM_GGML_USE_AMX
599
+ register_backend(lm_ggml_backend_amx_reg());
600
+ #endif
601
+ #ifdef LM_GGML_USE_KOMPUTE
602
+ register_backend(lm_ggml_backend_kompute_reg());
603
+ #endif
561
604
 
562
605
  register_backend(lm_ggml_backend_cpu_reg());
563
606
  }
564
607
 
565
608
  void register_backend(lm_ggml_backend_reg_t reg) {
566
609
  #ifndef NDEBUG
567
- fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
610
+ LM_GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
568
611
  __func__, lm_ggml_backend_reg_name(reg), lm_ggml_backend_reg_dev_count(reg));
569
612
  #endif
570
613
  backends.push_back(reg);
@@ -575,7 +618,7 @@ struct lm_ggml_backend_registry {
575
618
 
576
619
  void register_device(lm_ggml_backend_dev_t device) {
577
620
  #ifndef NDEBUG
578
- fprintf(stderr, "%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
621
+ LM_GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
579
622
  #endif
580
623
  devices.push_back(device);
581
624
  }
@@ -663,9 +706,9 @@ lm_ggml_backend_t lm_ggml_backend_init_by_type(enum lm_ggml_backend_dev_type typ
663
706
  }
664
707
 
665
708
  lm_ggml_backend_t lm_ggml_backend_init_best(void) {
666
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
709
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU);
667
710
  if (!dev) {
668
- dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
711
+ dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
669
712
  }
670
713
  if (!dev) {
671
714
  return NULL;
@@ -673,1918 +716,1946 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
673
716
  return lm_ggml_backend_dev_init(dev, NULL);
674
717
  }
675
718
 
676
- // backend CPU
719
+ // multi-buffer buffer
677
720
 
678
- static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
721
+ struct lm_ggml_backend_multi_buffer_context {
722
+ lm_ggml_backend_buffer_t * buffers;
723
+ size_t n_buffers;
724
+ };
679
725
 
680
- static const char * lm_ggml_backend_cpu_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
681
- return "CPU";
726
+ static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
727
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
728
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
729
+ lm_ggml_backend_buffer_free(ctx->buffers[i]);
730
+ }
682
731
 
683
- LM_GGML_UNUSED(buffer);
732
+ free(ctx->buffers);
733
+ free(ctx);
684
734
  }
685
735
 
686
- static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
687
- uintptr_t data = (uintptr_t)buffer->context;
688
-
689
- // align the buffer
690
- if (data % TENSOR_ALIGNMENT != 0) {
691
- data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
736
+ static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
737
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
738
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
739
+ lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
692
740
  }
693
-
694
- return (void *)data;
695
741
  }
696
742
 
697
- static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
698
- free(buffer->context);
699
- }
743
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
744
+ /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
745
+ /* .get_base = */ NULL,
746
+ /* .init_tensor = */ NULL,
747
+ /* .memset_tensor = */ NULL,
748
+ /* .set_tensor = */ NULL,
749
+ /* .get_tensor = */ NULL,
750
+ /* .cpy_tensor = */ NULL,
751
+ /* .clear = */ lm_ggml_backend_multi_buffer_clear,
752
+ /* .reset = */ NULL,
753
+ };
700
754
 
701
- static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
702
- memset((char *)tensor->data + offset, value, size);
755
+ lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
756
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
757
+ ctx->n_buffers = n_buffers;
758
+ ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
703
759
 
704
- LM_GGML_UNUSED(buffer);
705
- }
760
+ LM_GGML_ASSERT(ctx->buffers != NULL);
706
761
 
707
- static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
708
- memcpy((char *)tensor->data + offset, data, size);
762
+ size_t total_size = 0;
763
+ for (size_t i = 0; i < n_buffers; i++) {
764
+ ctx->buffers[i] = buffers[i];
765
+ total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
766
+ }
709
767
 
710
- LM_GGML_UNUSED(buffer);
768
+ return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
711
769
  }
712
770
 
713
- static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
714
- memcpy(data, (const char *)tensor->data + offset, size);
715
-
716
- LM_GGML_UNUSED(buffer);
771
+ bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
772
+ return buffer->iface.free_buffer == lm_ggml_backend_multi_buffer_free_buffer;
717
773
  }
718
774
 
719
- static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
720
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
721
- memcpy(dst->data, src->data, lm_ggml_nbytes(src));
722
- return true;
775
+ void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
776
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
777
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
778
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
779
+ lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
723
780
  }
724
- return false;
781
+ }
725
782
 
726
- LM_GGML_UNUSED(buffer);
783
+ // creates a copy of the tensor with the same memory layout
784
+ static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
785
+ struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
786
+ for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
787
+ dup->nb[i] = tensor->nb[i];
788
+ }
789
+ return dup;
727
790
  }
728
791
 
729
- static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
730
- memset(buffer->context, value, buffer->size);
792
+ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
793
+ return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
731
794
  }
732
795
 
733
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
734
- /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
735
- /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
736
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
737
- /* .init_tensor = */ NULL, // no initialization required
738
- /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
739
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
740
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
741
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
742
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
743
- /* .reset = */ NULL,
744
- };
796
+ // scheduler
745
797
 
746
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
747
- /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
748
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
749
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
750
- /* .init_tensor = */ NULL, // no initialization required
751
- /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
752
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
753
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
754
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
755
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
756
- /* .reset = */ NULL,
757
- };
798
+ #ifndef LM_GGML_SCHED_MAX_BACKENDS
799
+ #define LM_GGML_SCHED_MAX_BACKENDS 16
800
+ #endif
758
801
 
759
- static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
760
- return "CPU";
802
+ #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
803
+ #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
804
+ #endif
761
805
 
762
- LM_GGML_UNUSED(buft);
763
- }
806
+ #ifndef LM_GGML_SCHED_MAX_COPIES
807
+ #define LM_GGML_SCHED_MAX_COPIES 4
808
+ #endif
764
809
 
765
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
766
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
767
- void * data = malloc(size); // TODO: use LM_GGML_ALIGNED_MALLOC (move to ggml-impl.h)
768
- if (data == NULL) {
769
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
770
- return NULL;
771
- }
810
+ struct lm_ggml_backend_sched_split {
811
+ int backend_id;
812
+ int i_start;
813
+ int i_end;
814
+ struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
815
+ int n_inputs;
816
+ // graph view of this split
817
+ struct lm_ggml_cgraph graph;
818
+ };
772
819
 
773
- return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
774
- }
820
+ struct lm_ggml_backend_sched {
821
+ bool is_reset; // true if the scheduler has been reset since the last graph split
822
+ bool is_alloc;
775
823
 
776
- static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
777
- return TENSOR_ALIGNMENT;
824
+ int n_backends;
778
825
 
779
- LM_GGML_UNUSED(buft);
780
- }
826
+ lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
827
+ lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
828
+ lm_ggml_gallocr_t galloc;
781
829
 
782
- static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
783
- return true;
830
+ // hash map of the nodes in the graph
831
+ struct lm_ggml_hash_set hash_set;
832
+ int * hv_tensor_backend_ids; // [hash_set.size]
833
+ struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
784
834
 
785
- LM_GGML_UNUSED(buft);
786
- }
835
+ int * node_backend_ids; // [graph_size]
836
+ int * leaf_backend_ids; // [graph_size]
787
837
 
788
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
789
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
790
- /* .iface = */ {
791
- /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
792
- /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
793
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
794
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
795
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
796
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
797
- },
798
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
799
- /* .context = */ NULL,
800
- };
838
+ int * prev_node_backend_ids; // [graph_size]
839
+ int * prev_leaf_backend_ids; // [graph_size]
801
840
 
802
- return &lm_ggml_backend_cpu_buffer_type;
803
- }
841
+ // copy of the graph with modified inputs
842
+ struct lm_ggml_cgraph graph;
804
843
 
805
- #ifdef LM_GGML_USE_CPU_HBM
844
+ // graph splits
845
+ struct lm_ggml_backend_sched_split * splits;
846
+ int n_splits;
847
+ int splits_capacity;
806
848
 
807
- // buffer type HBM
849
+ // pipeline parallelism support
850
+ int n_copies;
851
+ int cur_copy;
852
+ lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
853
+ struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
854
+ int n_graph_inputs;
808
855
 
809
- #include <hbwmalloc.h>
856
+ struct lm_ggml_context * ctx;
810
857
 
811
- static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
812
- return "CPU_HBM";
858
+ lm_ggml_backend_sched_eval_callback callback_eval;
859
+ void * callback_eval_user_data;
813
860
 
814
- LM_GGML_UNUSED(buft);
815
- }
861
+ char * context_buffer;
862
+ size_t context_buffer_size;
816
863
 
817
- static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) {
818
- return "CPU_HBM";
864
+ int debug;
865
+ };
819
866
 
820
- LM_GGML_UNUSED(buf);
821
- }
867
+ #define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
868
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
869
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
870
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
822
871
 
823
- static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
824
- hbw_free(buffer->context);
872
+ // returns the priority of the backend, lower id is higher priority
873
+ static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
874
+ for (int i = 0; i < sched->n_backends; i++) {
875
+ if (sched->backends[i] == backend) {
876
+ return i;
877
+ }
878
+ }
879
+ return -1;
825
880
  }
826
881
 
827
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
828
- //void * ptr = hbw_malloc(size);
829
- void * ptr;
830
- int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
831
- if (result != 0) {
832
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
833
- return NULL;
882
+ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
883
+ lm_ggml_backend_buffer_t buffer = tensor->buffer;
884
+ if (buffer == NULL) {
885
+ return -1;
834
886
  }
835
887
 
836
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
837
- buffer->buft = buft;
838
- buffer->iface.get_name = lm_ggml_backend_cpu_hbm_buffer_get_name;
839
- buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
840
-
841
- return buffer;
842
- }
888
+ // find highest prio backend that supports the buffer type and the op
889
+ for (int i = 0; i < sched->n_backends; i++) {
890
+ if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
891
+ lm_ggml_backend_supports_op(sched->backends[i], op)) {
892
+ return i;
893
+ }
894
+ }
843
895
 
844
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
845
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
846
- /* .iface = */ {
847
- /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
848
- /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
849
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
850
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
851
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
852
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
853
- },
854
- /* .context = */ NULL,
855
- };
896
+ #ifndef NDEBUG
897
+ LM_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
898
+ __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
899
+ #endif
856
900
 
857
- return &lm_ggml_backend_cpu_buffer_type_hbm;
901
+ return -1;
858
902
  }
859
- #endif
860
903
 
861
- struct lm_ggml_backend_cpu_context {
862
- int n_threads;
863
- lm_ggml_threadpool_t threadpool;
904
+ #if 0
905
+ #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
906
+ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
907
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
908
+ #define GET_CAUSE(node) causes[hash_id(node)]
909
+ #else
910
+ #define SET_CAUSE(node, ...)
911
+ #define GET_CAUSE(node) ""
912
+ #endif
864
913
 
865
- uint8_t * work_data;
866
- size_t work_size;
914
+ // returns the backend that should be used for the node based on the current locations
915
+ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
916
+ // TODO: use supports_op to check if the backend supports the op
867
917
 
868
- lm_ggml_abort_callback abort_callback;
869
- void * abort_callback_data;
870
- };
918
+ // assign pre-allocated nodes to their backend
919
+ int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
920
+ if (cur_backend_id != -1) {
921
+ SET_CAUSE(tensor, "1.dst");
922
+ return cur_backend_id;
923
+ }
871
924
 
872
- static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
873
- return "CPU";
925
+ // view_src
926
+ if (tensor->view_src != NULL) {
927
+ cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
928
+ if (cur_backend_id != -1) {
929
+ SET_CAUSE(tensor, "1.vsrc");
930
+ return cur_backend_id;
931
+ }
932
+ }
874
933
 
875
- LM_GGML_UNUSED(backend);
876
- }
934
+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
935
+ // since the tensor is pre-allocated, it cannot be moved to another backend
936
+ LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
937
+ }
877
938
 
878
- static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
879
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
880
- delete[] cpu_ctx->work_data;
881
- delete cpu_ctx;
882
- delete backend;
883
- }
939
+ // graph input
940
+ if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
941
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
942
+ SET_CAUSE(tensor, "1.inp");
943
+ return cur_backend_id;
944
+ }
884
945
 
885
- static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) {
886
- return lm_ggml_backend_cpu_buffer_type();
946
+ // operations with weights are preferably run on the same backend as the weights
947
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
948
+ const struct lm_ggml_tensor * src = tensor->src[i];
949
+ if (src == NULL) {
950
+ continue;
951
+ }
952
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
953
+ // not an ideal solution
954
+ if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
955
+ int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
956
+ // check if a backend with higher prio wants to offload the op
957
+ if (src_backend_id == sched->n_backends - 1) {
958
+ for (int b = 0; b < src_backend_id; b++) {
959
+ if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
960
+ SET_CAUSE(tensor, "1.off");
961
+ return b;
962
+ }
963
+ }
964
+ }
965
+ SET_CAUSE(tensor, "1.wgt%d", i);
966
+ return src_backend_id;
967
+ }
968
+ }
887
969
 
888
- LM_GGML_UNUSED(backend);
970
+ return -1;
889
971
  }
890
972
 
891
- struct lm_ggml_backend_plan_cpu {
892
- struct lm_ggml_cplan cplan;
893
- struct lm_ggml_cgraph cgraph;
894
- };
895
-
896
- static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
897
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
898
-
899
- struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
900
-
901
- cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
902
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
973
+ static char * fmt_size(size_t size) {
974
+ static char buffer[128];
975
+ if (size >= 1024*1024) {
976
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
977
+ } else {
978
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
979
+ }
980
+ return buffer;
981
+ }
903
982
 
904
- if (cpu_plan->cplan.work_size > 0) {
905
- cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
906
- if (cpu_plan->cplan.work_data == NULL) {
907
- delete cpu_plan;
908
- return NULL;
983
+ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
984
+ int cur_split = 0;
985
+ for (int i = 0; i < graph->n_nodes; i++) {
986
+ if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
987
+ lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
988
+ LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
989
+ sched->splits[cur_split].n_inputs);
990
+ for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
991
+ LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
992
+ fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
993
+ }
994
+ LM_GGML_LOG_DEBUG("\n");
995
+ cur_split++;
996
+ }
997
+ struct lm_ggml_tensor * node = graph->nodes[i];
998
+ if (lm_ggml_is_view_op(node->op)) {
999
+ continue;
1000
+ }
1001
+ if (sched->debug > 1) {
1002
+ lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
1003
+ LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
1004
+ fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1005
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1006
+ struct lm_ggml_tensor * src = node->src[j];
1007
+ if (src == NULL) {
1008
+ continue;
1009
+ }
1010
+ lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
1011
+ LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1012
+ fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1013
+ }
1014
+ LM_GGML_LOG_DEBUG("\n");
909
1015
  }
910
1016
  }
911
-
912
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
913
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
914
-
915
- return cpu_plan;
916
1017
  }
917
1018
 
918
- static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
919
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
1019
+ static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
1020
+ lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1021
+ lm_ggml_backend_buffer_type_t buft = NULL;
920
1022
 
921
- delete[] cpu_plan->cplan.work_data;
922
- delete cpu_plan;
1023
+ if (buf) {
1024
+ // the tensor is already allocated
1025
+ buft = buf->buft;
1026
+ } else {
1027
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
1028
+ int tensor_backend_id = tensor_backend_id(t);
1029
+ if (tensor_backend_id == -1 && t->view_src) {
1030
+ tensor_backend_id = tensor_backend_id(t->view_src);
1031
+ }
1032
+ if (tensor_backend_id != -1) {
1033
+ buft = sched->bufts[tensor_backend_id];
1034
+ }
1035
+ }
923
1036
 
924
- LM_GGML_UNUSED(backend);
1037
+ return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
925
1038
  }
926
1039
 
927
- static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
928
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
929
-
930
- return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
931
-
932
- LM_GGML_UNUSED(backend);
1040
+ static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1041
+ if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1042
+ *node_backend_id = cur_backend_id;
1043
+ SET_CAUSE(node, "2.sup");
1044
+ }
933
1045
  }
934
1046
 
935
- static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
936
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
1047
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1048
+ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1049
+ // reset splits
1050
+ sched->n_splits = 0;
1051
+ sched->n_graph_inputs = 0;
1052
+ sched->is_reset = false;
937
1053
 
938
- struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
1054
+ struct lm_ggml_init_params params = {
1055
+ /* .mem_size = */ sched->context_buffer_size,
1056
+ /* .mem_buffer = */ sched->context_buffer,
1057
+ /* .no_alloc = */ true
1058
+ };
939
1059
 
940
- if (cpu_ctx->work_size < cplan.work_size) {
941
- delete[] cpu_ctx->work_data;
942
- cpu_ctx->work_data = new uint8_t[cplan.work_size];
943
- if (cpu_ctx->work_data == NULL) {
944
- cpu_ctx->work_size = 0;
945
- return LM_GGML_STATUS_ALLOC_FAILED;
946
- }
947
- cpu_ctx->work_size = cplan.work_size;
1060
+ lm_ggml_free(sched->ctx);
1061
+
1062
+ sched->ctx = lm_ggml_init(params);
1063
+ if (sched->ctx == NULL) {
1064
+ LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
948
1065
  }
949
- cplan.work_data = (uint8_t *)cpu_ctx->work_data;
950
1066
 
951
- cplan.abort_callback = cpu_ctx->abort_callback;
952
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
953
-
954
- return lm_ggml_graph_compute(cgraph, &cplan);
955
- }
956
-
957
- static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
958
- /* .get_name = */ lm_ggml_backend_cpu_get_name,
959
- /* .free = */ lm_ggml_backend_cpu_free,
960
- /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type,
961
- /* .set_tensor_async = */ NULL,
962
- /* .get_tensor_async = */ NULL,
963
- /* .cpy_tensor_async = */ NULL,
964
- /* .synchronize = */ NULL,
965
- /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
966
- /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
967
- /* .graph_plan_update = */ NULL,
968
- /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
969
- /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
970
- /* .supports_op = */ NULL,
971
- /* .supports_buft = */ NULL,
972
- /* .offload_op = */ NULL,
973
- /* .event_record = */ NULL,
974
- /* .event_wait = */ NULL,
975
- };
976
-
977
- static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
978
- static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
979
- return &guid;
980
- }
981
-
982
- lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
983
- struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
984
- if (ctx == NULL) {
985
- return NULL;
986
- }
987
-
988
- ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
989
- ctx->threadpool = NULL;
990
- ctx->work_data = NULL;
991
- ctx->work_size = 0;
992
- ctx->abort_callback = NULL;
993
- ctx->abort_callback_data = NULL;
994
-
995
- lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
996
- /* .guid = */ lm_ggml_backend_cpu_guid(),
997
- /* .interface = */ lm_ggml_backend_cpu_i,
998
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
999
- /* .context = */ ctx,
1000
- };
1001
-
1002
- if (cpu_backend == NULL) {
1003
- delete ctx;
1004
- return NULL;
1067
+ // pass 1: assign backends to ops with pre-allocated inputs
1068
+ for (int i = 0; i < graph->n_leafs; i++) {
1069
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
1070
+ int * leaf_backend_id = &tensor_backend_id(leaf);
1071
+ // do not overwrite user assignments
1072
+ if (*leaf_backend_id == -1) {
1073
+ *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
1074
+ }
1005
1075
  }
1006
1076
 
1007
- return cpu_backend;
1008
- }
1009
-
1010
- bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
1011
- return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
1012
- }
1013
-
1014
- void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
1015
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
1016
-
1017
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
1018
- ctx->n_threads = n_threads;
1019
- }
1020
-
1021
- void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
1022
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
1077
+ for (int i = 0; i < graph->n_nodes; i++) {
1078
+ struct lm_ggml_tensor * node = graph->nodes[i];
1079
+ int * node_backend_id = &tensor_backend_id(node);
1080
+ // do not overwrite user assignments
1081
+ if (*node_backend_id == -1) {
1082
+ *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
1023
1083
 
1024
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
1084
+ #if 0
1085
+ // src
1086
+ if (node->op == LM_GGML_OP_NONE) {
1087
+ continue;
1088
+ }
1025
1089
 
1026
- if (ctx->threadpool && ctx->threadpool != threadpool) {
1027
- // already had a different threadpool, pause/suspend it before switching
1028
- lm_ggml_threadpool_pause(ctx->threadpool);
1090
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1091
+ struct lm_ggml_tensor * src = node->src[j];
1092
+ if (src == NULL) {
1093
+ continue;
1094
+ }
1095
+ int * src_backend_id = &tensor_backend_id(src);
1096
+ if (*src_backend_id == -1) {
1097
+ *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
1098
+ }
1099
+ }
1100
+ #endif
1101
+ }
1029
1102
  }
1030
- ctx->threadpool = threadpool;
1031
- }
1032
-
1033
- void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
1034
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
1035
-
1036
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
1037
- ctx->abort_callback = abort_callback;
1038
- ctx->abort_callback_data = abort_callback_data;
1039
- }
1040
-
1041
- lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1042
- LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
1043
- return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
1044
- }
1045
-
1046
- ////////////////////////
1047
-
1048
- struct lm_ggml_backend_cpu_device_context {
1049
- std::string description = "CPU";
1050
1103
 
1051
- lm_ggml_backend_cpu_device_context() {
1052
- #ifdef __APPLE__
1053
- size_t len = 0;
1054
- if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
1055
- description.resize(len);
1056
- sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
1057
- }
1058
- #elif defined(__linux__)
1059
- FILE * f = fopen("/proc/cpuinfo", "r");
1060
- if (f) {
1061
- char buf[1024];
1062
- while (fgets(buf, sizeof(buf), f)) {
1063
- if (strncmp(buf, "model name", 10) == 0) {
1064
- char * p = strchr(buf, ':');
1065
- if (p) {
1066
- p++;
1067
- while (std::isspace(*p)) {
1068
- p++;
1069
- }
1070
- while (std::isspace(p[strlen(p) - 1])) {
1071
- p[strlen(p) - 1] = '\0';
1072
- }
1073
- description = p;
1074
- break;
1075
- }
1104
+ // pass 2: expand current backend assignments
1105
+ // assign the same backend to adjacent nodes
1106
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1107
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1108
+ // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1109
+ // expand gpu down
1110
+ {
1111
+ int cur_backend_id = -1;
1112
+ for (int i = 0; i < graph->n_nodes; i++) {
1113
+ struct lm_ggml_tensor * node = graph->nodes[i];
1114
+ if (lm_ggml_is_view_op(node->op)) {
1115
+ continue;
1116
+ }
1117
+ int * node_backend_id = &tensor_backend_id(node);
1118
+ if (*node_backend_id != -1) {
1119
+ if (*node_backend_id == sched->n_backends - 1) {
1120
+ // skip cpu (lowest prio backend)
1121
+ cur_backend_id = -1;
1122
+ } else {
1123
+ cur_backend_id = *node_backend_id;
1076
1124
  }
1125
+ } else if (cur_backend_id != -1) {
1126
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1077
1127
  }
1078
- fclose(f);
1079
1128
  }
1080
- #elif defined(_WIN32)
1081
- HKEY hKey;
1082
- if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
1083
- TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
1084
- 0,
1085
- KEY_READ,
1086
- &hKey) == ERROR_SUCCESS) {
1087
- DWORD cpu_brand_size = 0;
1088
- if (RegQueryValueExA(hKey,
1089
- TEXT("ProcessorNameString"),
1090
- NULL,
1091
- NULL,
1092
- NULL,
1093
- &cpu_brand_size) == ERROR_SUCCESS) {
1094
- description.resize(cpu_brand_size);
1095
- if (RegQueryValueExA(hKey,
1096
- TEXT("ProcessorNameString"),
1097
- NULL,
1098
- NULL,
1099
- (LPBYTE)&description[0], // NOLINT
1100
- &cpu_brand_size) == ERROR_SUCCESS) {
1101
- if (description.find('\0') != std::string::npos) {
1102
- description.resize(description.find('\0'));
1103
- }
1129
+ }
1130
+ // expand gpu up
1131
+ {
1132
+ int cur_backend_id = -1;
1133
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1134
+ struct lm_ggml_tensor * node = graph->nodes[i];
1135
+ if (lm_ggml_is_view_op(node->op)) {
1136
+ continue;
1137
+ }
1138
+ int * node_backend_id = &tensor_backend_id(node);
1139
+ if (*node_backend_id != -1) {
1140
+ if (*node_backend_id == sched->n_backends - 1) {
1141
+ // skip cpu (lowest prio backend)
1142
+ cur_backend_id = -1;
1143
+ } else {
1144
+ cur_backend_id = *node_backend_id;
1104
1145
  }
1146
+ } else if (cur_backend_id != -1) {
1147
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1148
+ }
1149
+ }
1150
+ }
1151
+ // expand rest down
1152
+ {
1153
+ int cur_backend_id = -1;
1154
+ for (int i = 0; i < graph->n_nodes; i++) {
1155
+ struct lm_ggml_tensor * node = graph->nodes[i];
1156
+ if (lm_ggml_is_view_op(node->op)) {
1157
+ continue;
1158
+ }
1159
+ int * node_backend_id = &tensor_backend_id(node);
1160
+ if (*node_backend_id != -1) {
1161
+ cur_backend_id = *node_backend_id;
1162
+ } else if (cur_backend_id != -1) {
1163
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1164
+ }
1165
+ }
1166
+ }
1167
+ // expand rest up
1168
+ {
1169
+ int cur_backend_id = -1;
1170
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1171
+ struct lm_ggml_tensor * node = graph->nodes[i];
1172
+ if (lm_ggml_is_view_op(node->op)) {
1173
+ continue;
1174
+ }
1175
+ int * node_backend_id = &tensor_backend_id(node);
1176
+ if (*node_backend_id != -1) {
1177
+ cur_backend_id = *node_backend_id;
1178
+ } else if (cur_backend_id != -1) {
1179
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1105
1180
  }
1106
- RegCloseKey(hKey);
1107
1181
  }
1108
- #endif
1109
1182
  }
1110
- };
1111
-
1112
- static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
1113
- return "CPU";
1114
-
1115
- LM_GGML_UNUSED(dev);
1116
- }
1117
-
1118
- static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
1119
- struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
1120
-
1121
- return ctx->description.c_str();
1122
- }
1123
1183
 
1124
- static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
1125
- // TODO
1126
- *free = 0;
1127
- *total = 0;
1128
-
1129
- LM_GGML_UNUSED(dev);
1130
- }
1131
-
1132
- static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
1133
- return LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
1134
-
1135
- LM_GGML_UNUSED(dev);
1136
- }
1137
-
1138
- static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
1139
- props->name = lm_ggml_backend_cpu_device_get_name(dev);
1140
- props->description = lm_ggml_backend_cpu_device_get_description(dev);
1141
- props->type = lm_ggml_backend_cpu_device_get_type(dev);
1142
- lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
1143
- props->caps = {
1144
- /* .async = */ false,
1145
- /* .host_buffer = */ false,
1146
- /* .buffer_from_host_ptr = */ true,
1147
- /* .events = */ false,
1148
- };
1149
- }
1150
-
1151
- static lm_ggml_backend_t lm_ggml_backend_cpu_device_init(lm_ggml_backend_dev_t dev, const char * params) {
1152
- return lm_ggml_backend_cpu_init();
1153
-
1154
- LM_GGML_UNUSED(dev);
1155
- LM_GGML_UNUSED(params);
1156
- }
1157
-
1158
- static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
1159
- return lm_ggml_backend_cpu_buffer_type();
1160
-
1161
- LM_GGML_UNUSED(dev);
1162
- }
1163
-
1164
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1165
- return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
1166
-
1167
- LM_GGML_UNUSED(dev);
1168
- LM_GGML_UNUSED(max_tensor_size);
1169
- }
1170
-
1171
- static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
1172
- switch (op->op) {
1173
- case LM_GGML_OP_CPY:
1174
- return
1175
- op->type != LM_GGML_TYPE_IQ2_XXS &&
1176
- op->type != LM_GGML_TYPE_IQ2_XS &&
1177
- op->type != LM_GGML_TYPE_IQ1_S &&
1178
- op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
1179
- case LM_GGML_OP_MUL_MAT:
1180
- return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
1181
- case LM_GGML_OP_ROPE_BACK:
1182
- return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
1183
- case LM_GGML_OP_IM2COL_BACK:
1184
- return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
1185
- case LM_GGML_OP_OUT_PROD:
1186
- return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
1187
- default:
1188
- return true;
1184
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1185
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1186
+ // however, we also need to verify that the sources are in compatible buffer types
1187
+ // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1188
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1189
+ // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1190
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1191
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1192
+ for (int i = 0; i < graph->n_nodes; i++) {
1193
+ struct lm_ggml_tensor * node = graph->nodes[i];
1194
+ if (lm_ggml_is_view_op(node->op)) {
1195
+ continue;
1196
+ }
1197
+ int * node_backend_id = &tensor_backend_id(node);
1198
+ if (*node_backend_id == -1) {
1199
+ // unassigned node: find the backend with the most supported inputs
1200
+ int n_supported_best = -1;
1201
+ for (int b = 0; b < sched->n_backends; b++) {
1202
+ if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1203
+ int n_supported = 0;
1204
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1205
+ struct lm_ggml_tensor * src = node->src[j];
1206
+ if (src == NULL) {
1207
+ continue;
1208
+ }
1209
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1210
+ n_supported++;
1211
+ }
1212
+ }
1213
+ if (n_supported > n_supported_best) {
1214
+ n_supported_best = n_supported;
1215
+ *node_backend_id = b;
1216
+ SET_CAUSE(node, "3.best");
1217
+ }
1218
+ }
1219
+ }
1220
+ } else {
1221
+ // assigned node: upgrade to higher prio backend if possible
1222
+ for (int b = 0; b < *node_backend_id; b++) {
1223
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1224
+ bool supported = true;
1225
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1226
+ struct lm_ggml_tensor * src = node->src[j];
1227
+ if (src == NULL) {
1228
+ continue;
1229
+ }
1230
+ if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1231
+ supported = false;
1232
+ break;
1233
+ }
1234
+ }
1235
+ if (supported) {
1236
+ *node_backend_id = b;
1237
+ SET_CAUSE(node, "3.upg");
1238
+ break;
1239
+ }
1240
+ }
1241
+ }
1242
+ }
1189
1243
  }
1190
1244
 
1191
- LM_GGML_UNUSED(dev);
1192
- }
1193
-
1194
- static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
1195
- return lm_ggml_backend_buft_is_host(buft);
1196
-
1197
- LM_GGML_UNUSED(dev);
1198
- }
1199
-
1200
- static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
1201
- /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
1202
- /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
1203
- /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
1204
- /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
1205
- /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
1206
- /* .init_backend = */ lm_ggml_backend_cpu_device_init,
1207
- /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
1208
- /* .get_host_buffer_type = */ NULL,
1209
- /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_ptr,
1210
- /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
1211
- /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
1212
- /* .offload_op = */ NULL,
1213
- /* .event_new = */ NULL,
1214
- /* .event_free = */ NULL,
1215
- /* .event_synchronize = */ NULL,
1216
- };
1217
-
1218
- ////////////////////////
1219
-
1220
- static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
1221
- return "CPU";
1222
-
1223
- LM_GGML_UNUSED(reg);
1224
- }
1225
-
1226
- static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
1227
- return 1;
1228
-
1229
- LM_GGML_UNUSED(reg);
1230
- }
1245
+ // pass 4: assign backends to remaining src from dst and view_src
1246
+ for (int i = 0; i < graph->n_nodes; i++) {
1247
+ struct lm_ggml_tensor * node = graph->nodes[i];
1248
+ int * cur_backend_id = &tensor_backend_id(node);
1249
+ if (node->view_src != NULL && *cur_backend_id == -1) {
1250
+ *cur_backend_id = tensor_backend_id(node->view_src);
1251
+ SET_CAUSE(node, "4.vsrc");
1252
+ }
1253
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1254
+ struct lm_ggml_tensor * src = node->src[j];
1255
+ if (src == NULL) {
1256
+ continue;
1257
+ }
1258
+ int * src_backend_id = &tensor_backend_id(src);
1259
+ if (*src_backend_id == -1) {
1260
+ if (src->view_src != NULL) {
1261
+ // views are always on the same backend as the source
1262
+ *src_backend_id = tensor_backend_id(src->view_src);
1263
+ SET_CAUSE(src, "4.vsrc");
1264
+ } else {
1265
+ *src_backend_id = *cur_backend_id;
1266
+ SET_CAUSE(src, "4.cur");
1267
+ }
1268
+ }
1269
+ }
1270
+ }
1231
1271
 
1232
- static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
1233
- LM_GGML_ASSERT(index == 0);
1272
+ // pass 5: split graph, find tensors that need to be copied
1273
+ {
1274
+ int i_split = 0;
1275
+ struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1276
+ // find the backend of the first split, skipping view ops
1277
+ int i = 0;
1278
+ for (; i < graph->n_nodes; i++) {
1279
+ struct lm_ggml_tensor * node = graph->nodes[i];
1280
+ if (!lm_ggml_is_view_op(node->op)) {
1281
+ split->backend_id = tensor_backend_id(node);
1282
+ break;
1283
+ }
1284
+ }
1285
+ split->i_start = 0;
1286
+ split->n_inputs = 0;
1287
+ int cur_backend_id = split->backend_id;
1288
+ for (; i < graph->n_nodes; i++) {
1289
+ struct lm_ggml_tensor * node = graph->nodes[i];
1234
1290
 
1235
- static lm_ggml_backend_cpu_device_context ctx;
1236
- static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
1237
- /* .iface = */ lm_ggml_backend_cpu_device_i,
1238
- /* .reg = */ reg,
1239
- /* .context = */ &ctx,
1240
- };
1291
+ if (lm_ggml_is_view_op(node->op)) {
1292
+ continue;
1293
+ }
1241
1294
 
1242
- return &lm_ggml_backend_cpu_device;
1243
- }
1295
+ const int node_backend_id = tensor_backend_id(node);
1244
1296
 
1245
- static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
1246
- if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
1247
- return (void *)lm_ggml_backend_cpu_set_n_threads;
1248
- }
1249
- return NULL;
1297
+ assert(node_backend_id != -1); // all nodes should be assigned by now
1250
1298
 
1251
- LM_GGML_UNUSED(reg);
1252
- }
1299
+ // check if we should start a new split based on the sources of the current node
1300
+ bool need_new_split = false;
1301
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1302
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1303
+ struct lm_ggml_tensor * src = node->src[j];
1304
+ if (src == NULL) {
1305
+ continue;
1306
+ }
1307
+ // check if a weight is on a different and incompatible backend
1308
+ // by starting a new split, the memory of the previously offloaded weights can be reused
1309
+ if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1310
+ int src_backend_id = tensor_backend_id(src);
1311
+ if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1312
+ need_new_split = true;
1313
+ break;
1314
+ }
1315
+ }
1316
+ // check if the split has too many inputs
1317
+ // FIXME: count the number of inputs instead of only checking when full
1318
+ if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1319
+ const size_t id = hash_id(src);
1320
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
1321
+ bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1322
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1323
+ need_new_split = true;
1324
+ break;
1325
+ }
1326
+ }
1327
+ }
1328
+ }
1253
1329
 
1254
- static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
1255
- /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
1256
- /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
1257
- /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
1258
- /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
1259
- };
1330
+ if (node_backend_id != cur_backend_id || need_new_split) {
1331
+ split->i_end = i;
1332
+ i_split++;
1333
+ if (i_split >= sched->splits_capacity) {
1334
+ sched->splits_capacity *= 2;
1335
+ sched->splits = (lm_ggml_backend_sched_split *)
1336
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1337
+ LM_GGML_ASSERT(sched->splits != NULL);
1338
+ }
1339
+ split = &sched->splits[i_split];
1340
+ split->backend_id = node_backend_id;
1341
+ split->i_start = i;
1342
+ split->n_inputs = 0;
1343
+ cur_backend_id = node_backend_id;
1344
+ }
1260
1345
 
1261
- lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
1262
- static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
1263
- /* .iface = */ lm_ggml_backend_cpu_reg_i,
1264
- /* .context = */ NULL,
1265
- };
1346
+ // find inputs that are not on the same backend
1347
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1348
+ struct lm_ggml_tensor * src = node->src[j];
1349
+ if (src == NULL) {
1350
+ continue;
1351
+ }
1266
1352
 
1267
- return &lm_ggml_backend_cpu_reg;
1268
- }
1353
+ size_t src_id = hash_id(src);
1354
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1355
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1269
1356
 
1270
- // multi-buffer buffer
1357
+ if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1358
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1359
+ lm_ggml_backend_t backend = sched->backends[src_backend_id];
1360
+ for (int c = 0; c < sched->n_copies; c++) {
1361
+ struct lm_ggml_tensor * tensor_copy;
1362
+ if (c == sched->cur_copy) {
1363
+ tensor_copy = src; // use the original tensor as the current copy
1364
+ } else {
1365
+ tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1366
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1367
+ }
1368
+ if (sched->n_copies > 1) {
1369
+ lm_ggml_set_input(tensor_copy);
1370
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1371
+ }
1372
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1373
+ SET_CAUSE(tensor_copy, "4.cpy");
1374
+ }
1375
+ int n_graph_inputs = sched->n_graph_inputs++;
1376
+ LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1377
+ sched->graph_inputs[n_graph_inputs] = src;
1378
+ }
1379
+ }
1271
1380
 
1272
- struct lm_ggml_backend_multi_buffer_context {
1273
- lm_ggml_backend_buffer_t * buffers;
1274
- size_t n_buffers;
1275
- };
1381
+ if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1382
+ // create a copy of the input in the split's backend
1383
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1384
+ lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1385
+ for (int c = 0; c < sched->n_copies; c++) {
1386
+ struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1387
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1388
+ if (sched->n_copies > 1) {
1389
+ lm_ggml_set_input(tensor_copy);
1390
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1391
+ }
1392
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1393
+ SET_CAUSE(tensor_copy, "4.cpy");
1394
+ }
1395
+ int n_inputs = split->n_inputs++;
1396
+ LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1397
+ split->inputs[n_inputs] = src;
1398
+ }
1399
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1400
+ }
1401
+ }
1402
+ }
1403
+ split->i_end = graph->n_nodes;
1404
+ sched->n_splits = i_split + 1;
1405
+ }
1276
1406
 
1277
- static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
1278
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1407
+ if (sched->debug) {
1408
+ lm_ggml_backend_sched_print_assignments(sched, graph);
1409
+ }
1279
1410
 
1280
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
1281
- }
1411
+ // swap node_backend_ids and leaf_backend_ids with prevs
1412
+ {
1413
+ int * tmp = sched->node_backend_ids;
1414
+ sched->node_backend_ids = sched->prev_node_backend_ids;
1415
+ sched->prev_node_backend_ids = tmp;
1282
1416
 
1283
- static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
1284
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1285
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1286
- lm_ggml_backend_buffer_free(ctx->buffers[i]);
1417
+ tmp = sched->leaf_backend_ids;
1418
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1419
+ sched->prev_leaf_backend_ids = tmp;
1287
1420
  }
1288
1421
 
1289
- free(ctx->buffers);
1290
- free(ctx);
1291
- }
1292
-
1293
- static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
1294
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1295
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1296
- lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
1422
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1423
+ if (sched->graph.size < graph_size) {
1424
+ sched->graph.size = graph_size;
1425
+ sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
1426
+ sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
1427
+ LM_GGML_ASSERT(sched->graph.nodes != NULL);
1428
+ LM_GGML_ASSERT(sched->graph.leafs != NULL);
1297
1429
  }
1298
- }
1430
+ sched->graph.n_nodes = 0;
1431
+ sched->graph.n_leafs = 0;
1299
1432
 
1300
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
1301
- /* .get_name = */ lm_ggml_backend_multi_buffer_get_name,
1302
- /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
1303
- /* .get_base = */ NULL,
1304
- /* .init_tensor = */ NULL,
1305
- /* .memset_tensor = */ NULL,
1306
- /* .set_tensor = */ NULL,
1307
- /* .get_tensor = */ NULL,
1308
- /* .cpy_tensor = */ NULL,
1309
- /* .clear = */ lm_ggml_backend_multi_buffer_clear,
1310
- /* .reset = */ NULL,
1311
- };
1433
+ struct lm_ggml_cgraph * graph_copy = &sched->graph;
1312
1434
 
1313
- lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
1314
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
1315
- ctx->n_buffers = n_buffers;
1316
- ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
1435
+ for (int i = 0; i < sched->n_splits; i++) {
1436
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1437
+ split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1317
1438
 
1318
- LM_GGML_ASSERT(ctx->buffers != NULL);
1439
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1440
+ for (int j = 0; j < split->n_inputs; j++) {
1441
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
1319
1442
 
1320
- size_t total_size = 0;
1321
- for (size_t i = 0; i < n_buffers; i++) {
1322
- ctx->buffers[i] = buffers[i];
1323
- total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
1324
- }
1443
+ struct lm_ggml_tensor * input = split->inputs[j];
1444
+ const size_t input_id = hash_id(input);
1445
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1325
1446
 
1326
- return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
1327
- }
1447
+ // add a dependency to the input source so that it is not freed before the copy is done
1448
+ struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
1449
+ input_dep->src[0] = input;
1450
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1451
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1328
1452
 
1329
- bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
1330
- return buffer->iface.get_name == lm_ggml_backend_multi_buffer_get_name;
1331
- }
1453
+ // add a dependency to the input copy so that it is allocated at the start of the split
1454
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1455
+ graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1456
+ }
1332
1457
 
1333
- void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
1334
- LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
1335
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1336
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1337
- lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
1458
+ for (int j = split->i_start; j < split->i_end; j++) {
1459
+ assert(graph_copy->size > graph_copy->n_nodes);
1460
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1461
+ graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1462
+ }
1338
1463
  }
1339
- }
1340
1464
 
1341
- // creates a copy of the tensor with the same memory layout
1342
- static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
1343
- struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
1344
- for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
1345
- dup->nb[i] = tensor->nb[i];
1465
+ if (sched->n_copies > 1) {
1466
+ // add input copies as leafs so that they are allocated first
1467
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
1468
+ struct lm_ggml_tensor * input = sched->graph_inputs[i];
1469
+ size_t id = hash_id(input);
1470
+ int backend_id = tensor_backend_id(input);
1471
+ for (int c = 0; c < sched->n_copies; c++) {
1472
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1473
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1474
+ assert(graph_copy->size > graph_copy->n_leafs);
1475
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1476
+ }
1477
+ }
1478
+
1479
+ for (int i = 0; i < sched->n_splits; i++) {
1480
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1481
+ int backend_id = split->backend_id;
1482
+ for (int j = 0; j < split->n_inputs; j++) {
1483
+ struct lm_ggml_tensor * input = split->inputs[j];
1484
+ size_t id = hash_id(input);
1485
+ for (int c = 0; c < sched->n_copies; c++) {
1486
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1487
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1488
+ assert(graph_copy->size > graph_copy->n_leafs);
1489
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1490
+ }
1491
+ }
1492
+ }
1346
1493
  }
1347
- return dup;
1348
- }
1349
1494
 
1350
- static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
1351
- return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
1495
+ // add leafs from the original graph
1496
+ for (int i = 0; i < graph->n_leafs; i++) {
1497
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
1498
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1499
+ assert(graph_copy->size > graph_copy->n_leafs);
1500
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1501
+ }
1352
1502
  }
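The debug dump invoked at the end of the splitting pass above is gated by sched->debug, which lm_ggml_backend_sched_new (further down in this file) reads from the LM_GGML_SCHED_DEBUG environment variable. A hedged sketch of turning it on from code before the scheduler is created; setenv is POSIX, so Windows builds would use _putenv_s instead:

#include <cstdlib>

// Enable the scheduler's per-node assignment / split dump.
// Must run before lm_ggml_backend_sched_new, which caches the value.
static void enable_sched_debug(void) {
    setenv("LM_GGML_SCHED_DEBUG", "1", /*overwrite=*/1);
}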
1353
1503
 
1354
- // scheduler
1355
-
1356
- #ifndef LM_GGML_SCHED_MAX_BACKENDS
1357
- #define LM_GGML_SCHED_MAX_BACKENDS 16
1358
- #endif
1359
-
1360
- #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
1361
- #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
1362
- #endif
1504
+ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
1505
+ bool backend_ids_changed = false;
1506
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
1507
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1508
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1509
+ backend_ids_changed = true;
1510
+ break;
1511
+ }
1512
+ }
1513
+ if (!backend_ids_changed) {
1514
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
1515
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1516
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1517
+ backend_ids_changed = true;
1518
+ break;
1519
+ }
1520
+ }
1521
+ }
1363
1522
 
1364
- #ifndef LM_GGML_SCHED_MAX_COPIES
1365
- #define LM_GGML_SCHED_MAX_COPIES 4
1523
+ // allocate graph
1524
+ if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1525
+ // the re-allocation may cause the split inputs to be moved to a different address
1526
+ lm_ggml_backend_sched_synchronize(sched);
1527
+ #ifndef NDEBUG
1528
+ LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1366
1529
  #endif
1530
+ lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1531
+ if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1532
+ LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
1533
+ return false;
1534
+ }
1535
+ }
1367
1536
 
1368
- struct lm_ggml_backend_sched_split {
1369
- int backend_id;
1370
- int i_start;
1371
- int i_end;
1372
- struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1373
- int n_inputs;
1374
- // graph view of this split
1375
- struct lm_ggml_cgraph graph;
1376
- };
1377
-
1378
- struct lm_ggml_backend_sched {
1379
- bool is_reset; // true if the scheduler has been reset since the last graph split
1380
- bool is_alloc;
1381
-
1382
- int n_backends;
1537
+ return true;
1538
+ }
1383
1539
 
1384
- lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
1385
- lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
1386
- lm_ggml_gallocr_t galloc;
1540
+ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
1541
+ struct lm_ggml_backend_sched_split * splits = sched->splits;
1387
1542
 
1388
- // hash map of the nodes in the graph
1389
- struct lm_ggml_hash_set hash_set;
1390
- int * hv_tensor_backend_ids; // [hash_set.size]
1391
- struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
1543
+ for (int i = 0; i < sched->n_splits; i++) {
1544
+ struct lm_ggml_backend_sched_split * split = &splits[i];
1545
+ int split_backend_id = split->backend_id;
1546
+ lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
1392
1547
 
1393
- int * node_backend_ids; // [graph_size]
1394
- int * leaf_backend_ids; // [graph_size]
1548
+ // copy the input tensors to the split backend
1549
+ for (int j = 0; j < split->n_inputs; j++) {
1550
+ lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1551
+ struct lm_ggml_tensor * input = split->inputs[j];
1552
+ struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
1395
1553
 
1396
- int * prev_node_backend_ids; // [graph_size]
1397
- int * prev_leaf_backend_ids; // [graph_size]
1554
+ if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1555
+ // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1556
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1557
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1558
+ } else {
1559
+ lm_ggml_backend_synchronize(split_backend);
1560
+ }
1561
+ lm_ggml_backend_tensor_copy(input, input_cpy);
1562
+ } else {
1563
+ // wait for the split backend to finish using the input before overwriting it
1564
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1565
+ lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1566
+ } else {
1567
+ lm_ggml_backend_synchronize(split_backend);
1568
+ }
1569
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1570
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1571
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1572
+ lm_ggml_backend_synchronize(input_backend);
1573
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1574
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1575
+ } else {
1576
+ lm_ggml_backend_synchronize(split_backend);
1577
+ }
1578
+ lm_ggml_backend_tensor_copy(input, input_cpy);
1579
+ }
1580
+ }
1581
+ }
1398
1582
 
1399
- // copy of the graph with modified inputs
1400
- struct lm_ggml_cgraph graph;
1583
+ if (!sched->callback_eval) {
1584
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
1585
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1586
+ return ec;
1587
+ }
1588
+ } else {
1589
+ // similar to lm_ggml_backend_compare_graph_backend
1590
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1591
+ struct lm_ggml_tensor * t = split->graph.nodes[j0];
1401
1592
 
1402
- // graph splits
1403
- struct lm_ggml_backend_sched_split * splits;
1404
- int n_splits;
1405
- int splits_capacity;
1593
+ // check if the user needs data from this node
1594
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1406
1595
 
1407
- // pipeline parallelism support
1408
- int n_copies;
1409
- int cur_copy;
1410
- lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
1411
- struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1412
- int n_graph_inputs;
1596
+ int j1 = j0;
1413
1597
 
1414
- struct lm_ggml_context * ctx;
1598
+ // determine the range [j0, j1] of nodes that can be computed together
1599
+ while (!need && j1 < split->graph.n_nodes - 1) {
1600
+ t = split->graph.nodes[++j1];
1601
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1602
+ }
1415
1603
 
1416
- lm_ggml_backend_sched_eval_callback callback_eval;
1417
- void * callback_eval_user_data;
1604
+ struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
1418
1605
 
1419
- char * context_buffer;
1420
- size_t context_buffer_size;
1606
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
1607
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1608
+ return ec;
1609
+ }
1421
1610
 
1422
- bool debug;
1423
- };
1611
+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1612
+ lm_ggml_backend_synchronize(split_backend);
1424
1613
 
1425
- #define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
1426
- #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
1427
- #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
1428
- #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
1614
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1615
+ break;
1616
+ }
1429
1617
 
1430
- // returns the priority of the backend, lower id is higher priority
1431
- static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1432
- for (int i = 0; i < sched->n_backends; i++) {
1433
- if (sched->backends[i] == backend) {
1434
- return i;
1618
+ j0 = j1;
1619
+ }
1435
1620
  }
1436
- }
1437
- return -1;
1438
- }
1439
-
1440
- static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
1441
- lm_ggml_backend_buffer_t buffer = tensor->buffer;
1442
- if (buffer == NULL) {
1443
- return -1;
1444
- }
1445
1621
 
1446
- // find highest prio backend that supports the buffer type and the op
1447
- for (int i = 0; i < sched->n_backends; i++) {
1448
- if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1449
- lm_ggml_backend_supports_op(sched->backends[i], op)) {
1450
- return i;
1622
+ // record the event of this copy
1623
+ if (split->n_inputs > 0) {
1624
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1625
+ lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
1626
+ }
1451
1627
  }
1452
1628
  }
1453
1629
 
1454
- #ifndef NDEBUG
1455
- fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1456
- __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
1457
- #endif
1630
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1458
1631
 
1459
- return -1;
1632
+ return LM_GGML_STATUS_SUCCESS;
1460
1633
  }
1461
1634
 
1462
- #if 0
1463
- #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
1464
- static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1465
- #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1466
- #define GET_CAUSE(node) causes[hash_id(node)]
1467
- #else
1468
- #define SET_CAUSE(node, ...)
1469
- #define GET_CAUSE(node) ""
1470
- #endif
1635
+ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
1636
+ lm_ggml_backend_t * backends,
1637
+ lm_ggml_backend_buffer_type_t * bufts,
1638
+ int n_backends,
1639
+ size_t graph_size,
1640
+ bool parallel) {
1641
+ LM_GGML_ASSERT(n_backends > 0);
1642
+ LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
1643
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1471
1644
 
1472
- // returns the backend that should be used for the node based on the current locations
1473
- static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
1474
- // TODO: use supports_op to check if the backend supports the op
1645
+ struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
1475
1646
 
1476
- // assign pre-allocated nodes to their backend
1477
- int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1478
- if (cur_backend_id != -1) {
1479
- SET_CAUSE(tensor, "1.dst");
1480
- return cur_backend_id;
1481
- }
1647
+ const char * LM_GGML_SCHED_DEBUG = getenv("LM_GGML_SCHED_DEBUG");
1648
+ sched->debug = LM_GGML_SCHED_DEBUG ? atoi(LM_GGML_SCHED_DEBUG) : 0;
1649
+ sched->n_backends = n_backends;
1650
+ sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
1482
1651
 
1483
- // view_src
1484
- if (tensor->view_src != NULL) {
1485
- cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1486
- if (cur_backend_id != -1) {
1487
- SET_CAUSE(tensor, "1.vsrc");
1488
- return cur_backend_id;
1489
- }
1490
- }
1652
+ // initialize hash table
1653
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1654
+ sched->hash_set = lm_ggml_hash_set_new(graph_size);
1655
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1656
+ sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1491
1657
 
1492
- if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
1493
- // since the tensor is pre-allocated, it cannot be moved to another backend
1494
- LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
1495
- }
1658
+ const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
1659
+ const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1660
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1661
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1662
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1663
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1496
1664
 
1497
- // graph input
1498
- if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1499
- cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1500
- SET_CAUSE(tensor, "1.inp");
1501
- return cur_backend_id;
1502
- }
1665
+ sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
1666
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);
1503
1667
 
1504
- // operations with weights are preferably run on the same backend as the weights
1505
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1506
- const struct lm_ggml_tensor * src = tensor->src[i];
1507
- if (src == NULL) {
1508
- continue;
1509
- }
1510
- if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1511
- int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1512
- // check if a backend with higher prio wants to offload the op
1513
- if (src_backend_id == sched->n_backends - 1) {
1514
- for (int b = 0; b < src_backend_id; b++) {
1515
- if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
1516
- SET_CAUSE(tensor, "1.off");
1517
- return b;
1518
- }
1519
- }
1668
+ const int initial_splits_capacity = 16;
1669
+ sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1670
+ sched->splits_capacity = initial_splits_capacity;
1671
+
1672
+ for (int b = 0; b < n_backends; b++) {
1673
+ sched->backends[b] = backends[b];
1674
+ sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
1675
+ LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1676
+
1677
+ if (sched->n_copies > 1) {
1678
+ for (int c = 0; c < sched->n_copies; c++) {
1679
+ sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
1520
1680
  }
1521
- SET_CAUSE(tensor, "1.wgt%d", i);
1522
- return src_backend_id;
1523
1681
  }
1524
1682
  }
1525
1683
 
1526
- return -1;
1684
+ sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
1685
+
1686
+ lm_ggml_backend_sched_reset(sched);
1687
+
1688
+ return sched;
1527
1689
  }
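For context, a minimal construction and teardown sketch around the constructor above; lm_ggml_backend_cpu_init is assumed to exist elsewhere in the package (it is not part of this hunk), and passing NULL for bufts makes the loop above fall back to each backend's default buffer type:

// Single CPU backend, no pipeline parallelism; the CPU backend must be last.
static lm_ggml_backend_sched_t make_cpu_sched(void) {
    lm_ggml_backend_t cpu = lm_ggml_backend_cpu_init(); // assumed helper
    lm_ggml_backend_t backends[] = { cpu };
    return lm_ggml_backend_sched_new(backends, /*bufts=*/NULL, /*n_backends=*/1,
                                     /*graph_size=*/LM_GGML_DEFAULT_GRAPH_SIZE,
                                     /*parallel=*/false);
}
// teardown: lm_ggml_backend_sched_free(sched), then free the backends themselves
// (a real caller would keep the cpu handle around for that).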
1528
1690
 
1529
- static char * fmt_size(size_t size) {
1530
- static char buffer[128];
1531
- if (size >= 1024*1024) {
1532
- snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1533
- } else {
1534
- snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1691
+ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
1692
+ if (sched == NULL) {
1693
+ return;
1535
1694
  }
1536
- return buffer;
1695
+ for (int b = 0; b < sched->n_backends; b++) {
1696
+ for (int c = 0; c < sched->n_copies; c++) {
1697
+ lm_ggml_backend_event_free(sched->events[b][c]);
1698
+ }
1699
+ }
1700
+ lm_ggml_gallocr_free(sched->galloc);
1701
+ lm_ggml_free(sched->ctx);
1702
+ lm_ggml_hash_set_free(&sched->hash_set);
1703
+ free(sched->splits);
1704
+ free(sched->hv_tensor_backend_ids);
1705
+ free(sched->hv_tensor_copies);
1706
+ free(sched->node_backend_ids);
1707
+ free(sched->leaf_backend_ids);
1708
+ free(sched->prev_node_backend_ids);
1709
+ free(sched->prev_leaf_backend_ids);
1710
+ free(sched->context_buffer);
1711
+ free(sched->graph.nodes);
1712
+ free(sched->graph.leafs);
1713
+ free(sched);
1537
1714
  }
1538
1715
 
1539
- static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1540
- int cur_split = 0;
1541
- for (int i = 0; i < graph->n_nodes; i++) {
1542
- if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1543
- lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1544
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
1545
- sched->splits[cur_split].n_inputs);
1546
- for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1547
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1548
- fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
1549
- }
1550
- fprintf(stderr, "\n");
1551
- cur_split++;
1552
- }
1553
- struct lm_ggml_tensor * node = graph->nodes[i];
1554
- if (lm_ggml_is_view_op(node->op)) {
1555
- continue;
1556
- }
1557
- lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
1558
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
1559
- fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1560
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1561
- struct lm_ggml_tensor * src = node->src[j];
1562
- if (src == NULL) {
1563
- continue;
1564
- }
1565
- lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
1566
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1567
- fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1568
- }
1569
- fprintf(stderr, "\n");
1716
+ void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
1717
+ // reset state for the next run
1718
+ if (!sched->is_reset) {
1719
+ lm_ggml_hash_set_reset(&sched->hash_set);
1720
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1721
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1722
+ sched->is_reset = true;
1570
1723
  }
1724
+ sched->is_alloc = false;
1571
1725
  }
1572
1726
 
1573
- static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
1574
- lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1575
- lm_ggml_backend_buffer_type_t buft = NULL;
1727
+ bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
1728
+ LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1576
1729
 
1577
- if (buf) {
1578
- // the tensor is already allocated
1579
- buft = buf->buft;
1580
- } else {
1581
- // see if the tensor already has a backend assigned, and use the buffer type of that backend
1582
- int tensor_backend_id = tensor_backend_id(t);
1583
- if (tensor_backend_id == -1 && t->view_src) {
1584
- tensor_backend_id = tensor_backend_id(t->view_src);
1585
- }
1586
- if (tensor_backend_id != -1) {
1587
- buft = sched->bufts[tensor_backend_id];
1588
- }
1730
+ lm_ggml_backend_sched_split_graph(sched, measure_graph);
1731
+
1732
+ if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1733
+ return false;
1589
1734
  }
1590
1735
 
1591
- return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
1592
- }
1736
+ lm_ggml_backend_sched_reset(sched);
1737
+ lm_ggml_backend_sched_synchronize(sched);
1593
1738
 
1594
- static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1595
- if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1596
- *node_backend_id = cur_backend_id;
1597
- SET_CAUSE(node, "2.sup");
1598
- }
1739
+ return true;
1599
1740
  }
1600
1741
 
1601
- // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1602
- static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1603
- // reset splits
1604
- sched->n_splits = 0;
1605
- sched->n_graph_inputs = 0;
1606
- sched->is_reset = false;
1742
+ bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1743
+ LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1607
1744
 
1608
- struct lm_ggml_init_params params = {
1609
- /* .mem_size = */ sched->context_buffer_size,
1610
- /* .mem_buffer = */ sched->context_buffer,
1611
- /* .no_alloc = */ true
1612
- };
1745
+ lm_ggml_backend_sched_split_graph(sched, graph);
1613
1746
 
1614
- lm_ggml_free(sched->ctx);
1615
1747
 
1616
- sched->ctx = lm_ggml_init(params);
1617
- if (sched->ctx == NULL) {
1618
- LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
1748
+ if (!lm_ggml_backend_sched_alloc_splits(sched)) {
1749
+ return false;
1619
1750
  }
1620
1751
 
1621
- // pass 1: assign backends to ops with pre-allocated inputs
1622
- for (int i = 0; i < graph->n_leafs; i++) {
1623
- struct lm_ggml_tensor * leaf = graph->leafs[i];
1624
- int * leaf_backend_id = &tensor_backend_id(leaf);
1625
- // do not overwrite user assignments
1626
- if (*leaf_backend_id == -1) {
1627
- *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
1628
- }
1629
- }
1752
+ sched->is_alloc = true;
1630
1753
 
1631
- for (int i = 0; i < graph->n_nodes; i++) {
1632
- struct lm_ggml_tensor * node = graph->nodes[i];
1633
- int * node_backend_id = &tensor_backend_id(node);
1634
- // do not overwrite user assignments
1635
- if (*node_backend_id == -1) {
1636
- *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
1754
+ return true;
1755
+ }
1637
1756
 
1638
- #if 0
1639
- // src
1640
- if (node->op == LM_GGML_OP_NONE) {
1641
- continue;
1642
- }
1757
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1758
+ enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
1759
+ lm_ggml_backend_sched_synchronize(sched);
1760
+ return err;
1761
+ }
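The synchronous wrapper above is the usual entry point; a hedged sketch of the intended call order around it, with the graphs assumed to be built by the caller:

// Reserve once with the largest expected graph, then compute and reset.
static bool sched_run(lm_ggml_backend_sched_t sched,
                      struct lm_ggml_cgraph * worst_case_graph,
                      struct lm_ggml_cgraph * graph) {
    if (!lm_ggml_backend_sched_reserve(sched, worst_case_graph)) {
        return false; // could not reserve backend buffers
    }
    // splits the graph, allocates it and synchronizes internally
    if (lm_ggml_backend_sched_graph_compute(sched, graph) != LM_GGML_STATUS_SUCCESS) {
        return false;
    }
    lm_ggml_backend_sched_reset(sched); // clear assignments before the next graph
    return true;
}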
1643
1762
 
1644
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1645
- struct lm_ggml_tensor * src = node->src[j];
1646
- if (src == NULL) {
1647
- continue;
1648
- }
1649
- int * src_backend_id = &tensor_backend_id(src);
1650
- if (*src_backend_id == -1) {
1651
- *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
1652
- }
1653
- }
1654
- #endif
1655
- }
1763
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1764
+ if (!sched->is_reset && !sched->is_alloc) {
1765
+ lm_ggml_backend_sched_reset(sched);
1656
1766
  }
1657
1767
 
1658
- // pass 2: expand current backend assignments
1659
- // assign the same backend to adjacent nodes
1660
- // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1661
- // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1662
- // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1663
- // expand gpu down
1664
- {
1665
- int cur_backend_id = -1;
1666
- for (int i = 0; i < graph->n_nodes; i++) {
1667
- struct lm_ggml_tensor * node = graph->nodes[i];
1668
- if (lm_ggml_is_view_op(node->op)) {
1669
- continue;
1670
- }
1671
- int * node_backend_id = &tensor_backend_id(node);
1672
- if (*node_backend_id != -1) {
1673
- if (*node_backend_id == sched->n_backends - 1) {
1674
- // skip cpu (lowest prio backend)
1675
- cur_backend_id = -1;
1676
- } else {
1677
- cur_backend_id = *node_backend_id;
1678
- }
1679
- } else if (cur_backend_id != -1) {
1680
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1681
- }
1682
- }
1683
- }
1684
- // expand gpu up
1685
- {
1686
- int cur_backend_id = -1;
1687
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1688
- struct lm_ggml_tensor * node = graph->nodes[i];
1689
- if (lm_ggml_is_view_op(node->op)) {
1690
- continue;
1691
- }
1692
- int * node_backend_id = &tensor_backend_id(node);
1693
- if (*node_backend_id != -1) {
1694
- if (*node_backend_id == sched->n_backends - 1) {
1695
- // skip cpu (lowest prio backend)
1696
- cur_backend_id = -1;
1697
- } else {
1698
- cur_backend_id = *node_backend_id;
1699
- }
1700
- } else if (cur_backend_id != -1) {
1701
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1702
- }
1703
- }
1704
- }
1705
- // expand rest down
1706
- {
1707
- int cur_backend_id = -1;
1708
- for (int i = 0; i < graph->n_nodes; i++) {
1709
- struct lm_ggml_tensor * node = graph->nodes[i];
1710
- if (lm_ggml_is_view_op(node->op)) {
1711
- continue;
1712
- }
1713
- int * node_backend_id = &tensor_backend_id(node);
1714
- if (*node_backend_id != -1) {
1715
- cur_backend_id = *node_backend_id;
1716
- } else if (cur_backend_id != -1) {
1717
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1718
- }
1719
- }
1720
- }
1721
- // expand rest up
1722
- {
1723
- int cur_backend_id = -1;
1724
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1725
- struct lm_ggml_tensor * node = graph->nodes[i];
1726
- if (lm_ggml_is_view_op(node->op)) {
1727
- continue;
1728
- }
1729
- int * node_backend_id = &tensor_backend_id(node);
1730
- if (*node_backend_id != -1) {
1731
- cur_backend_id = *node_backend_id;
1732
- } else if (cur_backend_id != -1) {
1733
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1734
- }
1768
+ if (!sched->is_alloc) {
1769
+ if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
1770
+ return LM_GGML_STATUS_ALLOC_FAILED;
1735
1771
  }
1736
1772
  }
1737
1773
 
1738
- // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1739
- // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1740
- // however, we also need to verify that the sources are in compatible buffer types
1741
- // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1742
- // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1743
- // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1744
- // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1745
- // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1746
- for (int i = 0; i < graph->n_nodes; i++) {
1747
- struct lm_ggml_tensor * node = graph->nodes[i];
1748
- if (lm_ggml_is_view_op(node->op)) {
1749
- continue;
1750
- }
1751
- int * node_backend_id = &tensor_backend_id(node);
1752
- if (*node_backend_id == -1) {
1753
- // unassigned node: find the backend with the most supported inputs
1754
- int n_supported_best = -1;
1755
- for (int b = 0; b < sched->n_backends; b++) {
1756
- if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1757
- int n_supported = 0;
1758
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1759
- struct lm_ggml_tensor * src = node->src[j];
1760
- if (src == NULL) {
1761
- continue;
1762
- }
1763
- if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1764
- n_supported++;
1765
- }
1766
- }
1767
- if (n_supported > n_supported_best) {
1768
- n_supported_best = n_supported;
1769
- *node_backend_id = b;
1770
- SET_CAUSE(node, "3.best");
1771
- }
1772
- }
1773
- }
1774
- } else {
1775
- // assigned node: upgrade to higher prio backend if possible
1776
- for (int b = 0; b < *node_backend_id; b++) {
1777
- if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1778
- bool supported = true;
1779
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1780
- struct lm_ggml_tensor * src = node->src[j];
1781
- if (src == NULL) {
1782
- continue;
1783
- }
1784
- if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1785
- supported = false;
1786
- break;
1787
- }
1788
- }
1789
- if (supported) {
1790
- *node_backend_id = b;
1791
- SET_CAUSE(node, "3.upg");
1792
- break;
1793
- }
1794
- }
1795
- }
1796
- }
1774
+ return lm_ggml_backend_sched_compute_splits(sched);
1775
+ }
1776
+
1777
+ void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
1778
+ for (int i = 0; i < sched->n_backends; i++) {
1779
+ lm_ggml_backend_synchronize(sched->backends[i]);
1797
1780
  }
1781
+ }
1782
+
1783
+ void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
1784
+ sched->callback_eval = callback;
1785
+ sched->callback_eval_user_data = user_data;
1786
+ }
1787
+
1788
+ int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
1789
+ return sched->n_splits;
1790
+ }
1791
+
1792
+ int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
1793
+ return sched->n_copies;
1794
+ }
1795
+
1796
+ int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
1797
+ return sched->n_backends;
1798
+ }
1799
+
1800
+ lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
1801
+ LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
1802
+ return sched->backends[i];
1803
+ }
1804
+
1805
+ size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1806
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1807
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1808
+
1809
+ return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1810
+ }
1811
+
1812
+ void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
1813
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1814
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1815
+ tensor_backend_id(node) = backend_index;
1816
+ SET_CAUSE(node, "usr");
1817
+ sched->is_reset = false;
1818
+ }
1798
1819
 
1799
- // pass 4: assign backends to remaining src from dst and view_src
1800
- for (int i = 0; i < graph->n_nodes; i++) {
1801
- struct lm_ggml_tensor * node = graph->nodes[i];
1802
- int * cur_backend_id = &tensor_backend_id(node);
1803
- if (node->view_src != NULL && *cur_backend_id == -1) {
1804
- *cur_backend_id = tensor_backend_id(node->view_src);
1805
- SET_CAUSE(node, "4.vsrc");
1806
- }
1807
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1808
- struct lm_ggml_tensor * src = node->src[j];
1809
- if (src == NULL) {
1810
- continue;
1811
- }
1812
- int * src_backend_id = &tensor_backend_id(src);
1813
- if (*src_backend_id == -1) {
1814
- if (src->view_src != NULL) {
1815
- // views are always on the same backend as the source
1816
- *src_backend_id = tensor_backend_id(src->view_src);
1817
- SET_CAUSE(src, "4.vsrc");
1818
- } else {
1819
- *src_backend_id = *cur_backend_id;
1820
- SET_CAUSE(src, "4.cur");
1821
- }
1822
- }
1823
- }
1820
+ lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
1821
+ int backend_index = tensor_backend_id(node);
1822
+ if (backend_index == -1) {
1823
+ return NULL;
1824
1824
  }
1825
+ return sched->backends[backend_index];
1826
+ }
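The two accessors above are the public way to pin a node to a backend before scheduling and to query its final placement afterwards; a small hedged sketch (graph and node are provided by the caller):

#include <cstdio>

// Pin one node to the highest-priority backend, compute, then report where
// the scheduler actually placed it.
static void pin_and_report(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph,
                           struct lm_ggml_tensor * node) {
    lm_ggml_backend_sched_set_tensor_backend(sched, node, lm_ggml_backend_sched_get_backend(sched, 0));
    lm_ggml_backend_sched_graph_compute(sched, graph);
    lm_ggml_backend_t used = lm_ggml_backend_sched_get_tensor_backend(sched, node);
    fprintf(stderr, "%s runs on %s\n", node->name, used ? lm_ggml_backend_name(used) : "NULL");
}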
1825
1827
 
1826
- // pass 5: split graph, find tensors that need to be copied
1827
- {
1828
- int i_split = 0;
1829
- struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1830
- // find the backend of the first split, skipping view ops
1831
- int i = 0;
1832
- for (; i < graph->n_nodes; i++) {
1833
- struct lm_ggml_tensor * node = graph->nodes[i];
1834
- if (!lm_ggml_is_view_op(node->op)) {
1835
- split->backend_id = tensor_backend_id(node);
1836
- break;
1837
- }
1838
- }
1839
- split->i_start = 0;
1840
- split->n_inputs = 0;
1841
- int cur_backend_id = split->backend_id;
1842
- for (; i < graph->n_nodes; i++) {
1843
- struct lm_ggml_tensor * node = graph->nodes[i];
1828
+ // utils
1844
1829
 
1845
- if (lm_ggml_is_view_op(node->op)) {
1846
- continue;
1847
- }
1830
+ void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
1831
+ LM_GGML_ASSERT(tensor->buffer == NULL);
1832
+ LM_GGML_ASSERT(tensor->view_src != NULL);
1833
+ LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
1834
+ LM_GGML_ASSERT(tensor->view_src->data != NULL);
1848
1835
 
1849
- const int node_backend_id = tensor_backend_id(node);
1836
+ tensor->buffer = tensor->view_src->buffer;
1837
+ tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1838
+ lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
1839
+ }
1850
1840
 
1851
- assert(node_backend_id != -1); // all nodes should be assigned by now
1841
+ void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
1842
+ LM_GGML_ASSERT(tensor->buffer == NULL);
1843
+ LM_GGML_ASSERT(tensor->data == NULL);
1844
+ LM_GGML_ASSERT(tensor->view_src == NULL);
1845
+ LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
1846
+ LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
1847
+ (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
1852
1848
 
1853
- // check if we should start a new split based on the sources of the current node
1854
- bool need_new_split = false;
1855
- if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1856
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1857
- struct lm_ggml_tensor * src = node->src[j];
1858
- if (src == NULL) {
1859
- continue;
1860
- }
1861
- // check if a weight is on a different backend
1862
- // by starting a new split, the memory of the previously offloaded weights can be reused
1863
- if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1864
- int src_backend_id = tensor_backend_id(src);
1865
- if (src_backend_id != cur_backend_id) {
1866
- need_new_split = true;
1867
- break;
1868
- }
1869
- }
1870
- // check if the split has too many inputs
1871
- // FIXME: count the number of inputs instead of only checking when full
1872
- if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1873
- const size_t id = hash_id(src);
1874
- int src_backend_id = sched->hv_tensor_backend_ids[id];
1875
- bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1876
- if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1877
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1878
- need_new_split = true;
1879
- break;
1880
- }
1881
- }
1882
- }
1883
- }
1849
+ tensor->buffer = buffer;
1850
+ tensor->data = addr;
1851
+ lm_ggml_backend_buffer_init_tensor(buffer, tensor);
1852
+ }
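A hedged sketch of a caller of the helper above; lm_ggml_new_tensor_1d and LM_GGML_TYPE_F32 are the standard prefixed ggml constructors and are not part of this hunk, the context must have been created with no_alloc, and the buffer must be large enough, otherwise the asserts above fire:

// Manually place a fresh tensor at the base of an existing backend buffer
// (this is the bookkeeping that ggml-alloc normally performs).
static void place_at_base(struct lm_ggml_context * ctx, lm_ggml_backend_buffer_t buf) {
    struct lm_ggml_tensor * t = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 1024);
    lm_ggml_backend_tensor_alloc(buf, t, lm_ggml_backend_buffer_get_base(buf));
}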
1884
1853
 
1885
- if (node_backend_id != cur_backend_id || need_new_split) {
1886
- split->i_end = i;
1887
- i_split++;
1888
- if (i_split >= sched->splits_capacity) {
1889
- sched->splits_capacity *= 2;
1890
- sched->splits = (lm_ggml_backend_sched_split *)
1891
- realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1892
- LM_GGML_ASSERT(sched->splits != NULL);
1893
- }
1894
- split = &sched->splits[i_split];
1895
- split->backend_id = node_backend_id;
1896
- split->i_start = i;
1897
- split->n_inputs = 0;
1898
- cur_backend_id = node_backend_id;
1899
- }
1854
+ static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
1855
+ struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
1900
1856
 
1901
- // find inputs that are not on the same backend
1902
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1903
- struct lm_ggml_tensor * src = node->src[j];
1904
- if (src == NULL) {
1905
- continue;
1906
- }
1857
+ LM_GGML_ASSERT(src != NULL);
1858
+ LM_GGML_ASSERT(src->data && "graph must be allocated");
1907
1859
 
1908
- size_t src_id = hash_id(src);
1909
- const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1910
- assert(src_backend_id != -1); // all inputs should be assigned by now
1860
+ size_t id = lm_ggml_hash_insert(&hash_set, src);
1861
+ if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
1862
+ return node_copies[lm_ggml_hash_find(&hash_set, src)];
1863
+ }
1911
1864
 
1912
- if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1913
- if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1914
- lm_ggml_backend_t backend = sched->backends[src_backend_id];
1915
- for (int c = 0; c < sched->n_copies; c++) {
1916
- struct lm_ggml_tensor * tensor_copy;
1917
- if (c == sched->cur_copy) {
1918
- tensor_copy = src; // use the original tensor as the current copy
1919
- } else {
1920
- tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1921
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1922
- }
1923
- if (sched->n_copies > 1) {
1924
- lm_ggml_set_input(tensor_copy);
1925
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1926
- }
1927
- tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1928
- SET_CAUSE(tensor_copy, "4.cpy");
1929
- }
1930
- int n_graph_inputs = sched->n_graph_inputs++;
1931
- LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1932
- sched->graph_inputs[n_graph_inputs] = src;
1933
- }
1934
- }
1865
+ struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1866
+ if (src->view_src != NULL) {
1867
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1868
+ dst->view_offs = src->view_offs;
1869
+ }
1870
+ dst->op = src->op;
1871
+ memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
1872
+ lm_ggml_set_name(dst, src->name);
1935
1873
 
1936
- if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1937
- // create a copy of the input in the split's backend
1938
- if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1939
- lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1940
- for (int c = 0; c < sched->n_copies; c++) {
1941
- struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1942
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1943
- if (sched->n_copies > 1) {
1944
- lm_ggml_set_input(tensor_copy);
1945
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1946
- }
1947
- tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1948
- SET_CAUSE(tensor_copy, "4.cpy");
1949
- }
1950
- int n_inputs = split->n_inputs++;
1951
- LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1952
- split->inputs[n_inputs] = src;
1953
- }
1954
- node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1955
- }
1956
- }
1874
+ // copy src
1875
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1876
+ struct lm_ggml_tensor * s = src->src[i];
1877
+ if (s == NULL) {
1878
+ continue;
1957
1879
  }
1958
- split->i_end = graph->n_nodes;
1959
- sched->n_splits = i_split + 1;
1880
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1881
+ }
1882
+
1883
+ node_copies[id] = dst;
1884
+ return dst;
1885
+ }
1886
+
1887
+ static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
1888
+ size_t id = lm_ggml_hash_find(hash_set, src);
1889
+ if (node_init[id]) {
1890
+ return;
1891
+ }
1892
+ node_init[id] = true;
1893
+
1894
+ struct lm_ggml_tensor * dst = node_copies[id];
1895
+ if (dst->view_src != NULL) {
1896
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1897
+ lm_ggml_backend_view_init(dst);
1898
+ }
1899
+ else {
1900
+ lm_ggml_backend_tensor_copy(src, dst);
1901
+ }
1902
+
1903
+ // init src
1904
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1905
+ struct lm_ggml_tensor * s = src->src[i];
1906
+ if (s == NULL) {
1907
+ continue;
1908
+ }
1909
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
1910
+ }
1911
+ }
1912
+
1913
+ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
1914
+ struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
1915
+ struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
1916
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
1917
+
1918
+ struct lm_ggml_init_params params = {
1919
+ /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
1920
+ /* .mem_buffer = */ NULL,
1921
+ /* .no_alloc = */ true
1922
+ };
1923
+
1924
+ struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
1925
+ struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
1926
+
1927
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
1928
+ LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
1929
+ lm_ggml_hash_set_free(&hash_set);
1930
+ free(node_copies);
1931
+ free(node_init);
1932
+ lm_ggml_free(ctx_allocated);
1933
+ lm_ggml_free(ctx_unallocated);
1934
+ return {
1935
+ /* .buffer = */ NULL,
1936
+ /* .ctx_allocated = */ NULL,
1937
+ /* .ctx_unallocated = */ NULL,
1938
+ /* .graph = */ NULL,
1939
+ };
1940
+ }
1941
+
1942
+ // dup nodes
1943
+ for (int i = 0; i < graph->n_nodes; i++) {
1944
+ struct lm_ggml_tensor * node = graph->nodes[i];
1945
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
1960
1946
  }
1961
1947
 
1962
- if (sched->debug) {
1963
- lm_ggml_backend_sched_print_assignments(sched, graph);
1948
+ // allocate nodes
1949
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
1950
+ if (buffer == NULL) {
1951
+ LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
1952
+ lm_ggml_hash_set_free(&hash_set);
1953
+ free(node_copies);
1954
+ free(node_init);
1955
+ lm_ggml_free(ctx_allocated);
1956
+ lm_ggml_free(ctx_unallocated);
1957
+ return {
1958
+ /* .buffer = */ NULL,
1959
+ /* .ctx_allocated = */ NULL,
1960
+ /* .ctx_unallocated = */ NULL,
1961
+ /* .graph = */ NULL,
1962
+ };
1964
1963
  }
1965
1964
 
1966
- // swap node_backend_ids and leaf _backend_ids with prevs
1967
- {
1968
- int * tmp = sched->node_backend_ids;
1969
- sched->node_backend_ids = sched->prev_node_backend_ids;
1970
- sched->prev_node_backend_ids = tmp;
1965
+ //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
1971
1966
 
1972
- tmp = sched->leaf_backend_ids;
1973
- sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1974
- sched->prev_leaf_backend_ids = tmp;
1967
+ // copy data and init views
1968
+ for (int i = 0; i < graph->n_nodes; i++) {
1969
+ struct lm_ggml_tensor * node = graph->nodes[i];
1970
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
1975
1971
  }
1976
1972
 
1977
- int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1978
- if (sched->graph.size < graph_size) {
1979
- sched->graph.size = graph_size;
1980
- sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
1981
- sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
1982
- LM_GGML_ASSERT(sched->graph.nodes != NULL);
1983
- LM_GGML_ASSERT(sched->graph.leafs != NULL);
1973
+ // build graph copy
1974
+ struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
1975
+ for (int i = 0; i < graph->n_nodes; i++) {
1976
+ struct lm_ggml_tensor * node = graph->nodes[i];
1977
+ struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
1978
+ graph_copy->nodes[i] = node_copy;
1984
1979
  }
1985
- sched->graph.n_nodes = 0;
1986
- sched->graph.n_leafs = 0;
1980
+ graph_copy->n_nodes = graph->n_nodes;
1987
1981
 
1988
- struct lm_ggml_cgraph * graph_copy = &sched->graph;
1982
+ lm_ggml_hash_set_free(&hash_set);
1983
+ free(node_copies);
1984
+ free(node_init);
1989
1985
 
1990
- for (int i = 0; i < sched->n_splits; i++) {
1991
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1992
- split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1986
+ return {
1987
+ /* .buffer = */ buffer,
1988
+ /* .ctx_allocated = */ ctx_allocated,
1989
+ /* .ctx_unallocated = */ ctx_unallocated,
1990
+ /* .graph = */ graph_copy,
1991
+ };
1992
+ }
1993
1993
 
1994
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1995
- for (int j = 0; j < split->n_inputs; j++) {
1996
- assert(graph_copy->size > (graph_copy->n_nodes + 1));
1994
+ void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
1995
+ lm_ggml_backend_buffer_free(copy.buffer);
1996
+ lm_ggml_free(copy.ctx_allocated);
1997
+ lm_ggml_free(copy.ctx_unallocated);
1998
+ }
1997
1999
 
1998
- struct lm_ggml_tensor * input = split->inputs[j];
1999
- const size_t input_id = hash_id(input);
2000
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
2000
+ bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
2001
+ struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
2002
+ if (copy.buffer == NULL) {
2003
+ return false;
2004
+ }
2001
2005
 
2002
- // add a dependency to the input source so that it is not freed before the copy is done
2003
- struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
2004
- input_dep->src[0] = input;
2005
- sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
2006
- graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
2006
+ struct lm_ggml_cgraph * g1 = graph;
2007
+ struct lm_ggml_cgraph * g2 = copy.graph;
2007
2008
 
2008
- // add a dependency to the input copy so that it is allocated at the start of the split
2009
- sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
2010
- graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
2011
- }
2009
+ assert(g1->n_nodes == g2->n_nodes);
2012
2010
 
2013
- for (int j = split->i_start; j < split->i_end; j++) {
2014
- assert(graph_copy->size > graph_copy->n_nodes);
2015
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
2016
- graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
2017
- }
2018
- }
2011
+ for (int i = 0; i < g1->n_nodes; i++) {
2012
+ //printf("eval %d/%d\n", i, g1->n_nodes);
2013
+ struct lm_ggml_tensor * t1 = g1->nodes[i];
2014
+ struct lm_ggml_tensor * t2 = g2->nodes[i];
2019
2015
 
2020
- if (sched->n_copies > 1) {
2021
- // add input copies as leafs so that they are allocated first
2022
- for (int i = 0; i < sched->n_graph_inputs; i++) {
2023
- struct lm_ggml_tensor * input = sched->graph_inputs[i];
2024
- size_t id = hash_id(input);
2025
- int backend_id = tensor_backend_id(input);
2026
- for (int c = 0; c < sched->n_copies; c++) {
2027
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
2028
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
2029
- assert(graph_copy->size > graph_copy->n_leafs);
2030
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
2031
- }
2016
+ assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
2017
+
2018
+ struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
2019
+ struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
2020
+
2021
+ lm_ggml_backend_graph_compute(backend1, &g1v);
2022
+ lm_ggml_backend_graph_compute(backend2, &g2v);
2023
+
2024
+ if (lm_ggml_is_view_op(t1->op)) {
2025
+ continue;
2032
2026
  }
2033
2027
 
2034
- for (int i = 0; i < sched->n_splits; i++) {
2035
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
2036
- int backend_id = split->backend_id;
2037
- for (int j = 0; j < split->n_inputs; j++) {
2038
- struct lm_ggml_tensor * input = split->inputs[j];
2039
- size_t id = hash_id(input);
2040
- for (int c = 0; c < sched->n_copies; c++) {
2041
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
2042
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
2043
- assert(graph_copy->size > graph_copy->n_leafs);
2044
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
2045
- }
2046
- }
2028
+ // compare results, calculate rms etc
2029
+ if (!callback(i, t1, t2, user_data)) {
2030
+ break;
2047
2031
  }
2048
2032
  }
2049
2033
 
2050
- // add leafs from the original graph
2051
- for (int i = 0; i < graph->n_leafs; i++) {
2052
- struct lm_ggml_tensor * leaf = graph->leafs[i];
2053
- sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
2054
- assert(graph_copy->size > graph_copy->n_leafs);
2055
- graph_copy->leafs[graph_copy->n_leafs++] = leaf;
2056
- }
2034
+ lm_ggml_backend_graph_copy_free(copy);
2035
+
2036
+ return true;
2057
2037
  }
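
A minimal usage sketch of the comparison helper added above (illustrative only, not part of the package diff): the callback receives the node index and the two per-backend result tensors and returns false to stop early; the backends and graph are assumed to have been created elsewhere.

    #include <stdio.h>

    static bool compare_cb(int node_index, struct lm_ggml_tensor * t1, struct lm_ggml_tensor * t2, void * user_data) {
        // a real callback would read both tensors back and compute an error metric;
        // returning true continues with the next node, false stops the comparison
        fprintf(stderr, "node %d (%s): computed on both backends\n", node_index, t1->name);
        (void) t2; (void) user_data;
        return true;
    }

    // bool ok = lm_ggml_backend_compare_graph_backend(cpu_backend, other_backend, graph, compare_cb, /*user_data=*/NULL);
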
2058
2038
 
2059
- static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
2060
- bool backend_ids_changed = false;
2061
- for (int i = 0; i < sched->graph.n_nodes; i++) {
2062
- if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
2063
- sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
2064
- backend_ids_changed = true;
2065
- break;
2066
- }
2039
+
2040
+
2041
+ #include "ggml-backend.h"
2042
+ #include "ggml-backend-impl.h"
2043
+ #include "ggml-cpu.h"
2044
+ #include "ggml-impl.h"
2045
+ #include <cctype>
2046
+ #include <string>
2047
+
2048
+ // ggml-backend interface
2049
+
2050
+ // CPU backend - buffer
2051
+
2052
+ static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
2053
+ uintptr_t data = (uintptr_t)buffer->context;
2054
+
2055
+ // align the buffer
2056
+ if (data % TENSOR_ALIGNMENT != 0) {
2057
+ data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
2067
2058
  }
2068
- if (!backend_ids_changed) {
2069
- for (int i = 0; i < sched->graph.n_leafs; i++) {
2070
- if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
2071
- sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
2072
- backend_ids_changed = true;
2073
- break;
2074
- }
2075
- }
2059
+
2060
+ return (void *)data;
2061
+ }
2062
+
2063
+ static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
2064
+ lm_ggml_aligned_free(buffer->context, buffer->size);
2065
+ }
2066
+
2067
+ static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
2068
+ memset((char *)tensor->data + offset, value, size);
2069
+
2070
+ LM_GGML_UNUSED(buffer);
2071
+ }
2072
+
2073
+ static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2074
+ memcpy((char *)tensor->data + offset, data, size);
2075
+
2076
+ LM_GGML_UNUSED(buffer);
2077
+ }
2078
+
2079
+ static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2080
+ memcpy(data, (const char *)tensor->data + offset, size);
2081
+
2082
+ LM_GGML_UNUSED(buffer);
2083
+ }
2084
+
2085
+ static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
2086
+ if (lm_ggml_backend_buffer_is_host(src->buffer)) {
2087
+ memcpy(dst->data, src->data, lm_ggml_nbytes(src));
2088
+ return true;
2076
2089
  }
2090
+ return false;
2077
2091
 
2078
- // allocate graph
2079
- if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2080
- // the re-allocation may cause the split inputs to be moved to a different address
2081
- lm_ggml_backend_sched_synchronize(sched);
2082
- #ifndef NDEBUG
2083
- fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
2084
- #endif
2085
- lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
2086
- if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2087
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
2088
- return false;
2089
- }
2092
+ LM_GGML_UNUSED(buffer);
2093
+ }
2094
+
2095
+ static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
2096
+ memset(buffer->context, value, buffer->size);
2097
+ }
2098
+
2099
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
2100
+ /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
2101
+ /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
2102
+ /* .init_tensor = */ NULL, // no initialization required
2103
+ /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
2104
+ /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
2105
+ /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
2106
+ /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
2107
+ /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
2108
+ /* .reset = */ NULL,
2109
+ };
2110
+
2111
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
2112
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
2113
+ /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
2114
+ /* .init_tensor = */ NULL, // no initialization required
2115
+ /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
2116
+ /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
2117
+ /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
2118
+ /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
2119
+ /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
2120
+ /* .reset = */ NULL,
2121
+ };
2122
+
2123
+ // CPU backend - buffer type
2124
+
2125
+ static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2126
+ return "CPU";
2127
+
2128
+ LM_GGML_UNUSED(buft);
2129
+ }
2130
+
2131
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
2132
+ void * data = lm_ggml_aligned_malloc(size);
2133
+
2134
+ if (data == NULL) {
2135
+ LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
2136
+ return NULL;
2090
2137
  }
2091
2138
 
2092
- return true;
2139
+ return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
2093
2140
  }
2094
2141
 
2095
- static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
2096
- struct lm_ggml_backend_sched_split * splits = sched->splits;
2142
+ static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
2143
+ return TENSOR_ALIGNMENT;
2097
2144
 
2098
- for (int i = 0; i < sched->n_splits; i++) {
2099
- struct lm_ggml_backend_sched_split * split = &splits[i];
2100
- int split_backend_id = split->backend_id;
2101
- lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
2145
+ LM_GGML_UNUSED(buft);
2146
+ }
2102
2147
 
2103
- // copy the input tensors to the split backend
2104
- for (int j = 0; j < split->n_inputs; j++) {
2105
- lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
2106
- struct lm_ggml_tensor * input = split->inputs[j];
2107
- struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
2148
+ static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
2149
+ return true;
2108
2150
 
2109
- if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
2110
- // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
2111
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2112
- lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2113
- } else {
2114
- lm_ggml_backend_synchronize(split_backend);
2115
- }
2116
- lm_ggml_backend_tensor_copy(input, input_cpy);
2117
- } else {
2118
- // wait for the split backend to finish using the input before overwriting it
2119
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2120
- lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
2121
- } else {
2122
- lm_ggml_backend_synchronize(split_backend);
2123
- }
2124
- // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
2125
- // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
2126
- if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
2127
- lm_ggml_backend_synchronize(input_backend);
2128
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2129
- lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2130
- } else {
2131
- lm_ggml_backend_synchronize(split_backend);
2132
- }
2133
- lm_ggml_backend_tensor_copy(input, input_cpy);
2134
- }
2135
- }
2136
- }
2151
+ LM_GGML_UNUSED(buft);
2152
+ }
2137
2153
 
2138
- if (!sched->callback_eval) {
2139
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
2140
- if (ec != LM_GGML_STATUS_SUCCESS) {
2141
- return ec;
2142
- }
2143
- } else {
2144
- // similar to lm_ggml_backend_compare_graph_backend
2145
- for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
2146
- struct lm_ggml_tensor * t = split->graph.nodes[j0];
2154
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
2155
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
2156
+ /* .iface = */ {
2157
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
2158
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
2159
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2160
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2161
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2162
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2163
+ },
2164
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2165
+ /* .context = */ NULL,
2166
+ };
2147
2167
 
2148
- // check if the user needs data from this node
2149
- bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2168
+ return &lm_ggml_backend_cpu_buffer_type;
2169
+ }
2150
2170
 
2151
- int j1 = j0;
2171
+ static const char * lm_ggml_backend_cpu_buffer_from_ptr_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2172
+ return "CPU_Mapped";
2152
2173
 
2153
- // determine the range [j0, j1] of nodes that can be computed together
2154
- while (!need && j1 < split->graph.n_nodes - 1) {
2155
- t = split->graph.nodes[++j1];
2156
- need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2157
- }
2174
+ LM_GGML_UNUSED(buft);
2175
+ }
2158
2176
 
2159
- struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
2177
+ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_from_ptr_type(void) {
2178
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
2179
+ /* .iface = */ {
2180
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_from_ptr_type_get_name,
2181
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
2182
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2183
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2184
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2185
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2186
+ },
2187
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2188
+ /* .context = */ NULL,
2189
+ };
2160
2190
 
2161
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
2162
- if (ec != LM_GGML_STATUS_SUCCESS) {
2163
- return ec;
2164
- }
2191
+ return &lm_ggml_backend_cpu_buffer_type;
2192
+ }
2165
2193
 
2166
- // TODO: pass backend to the callback, then the user can decide if they want to synchronize
2167
- lm_ggml_backend_synchronize(split_backend);
2194
+ #ifdef LM_GGML_USE_CPU_HBM
2168
2195
 
2169
- if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
2170
- break;
2171
- }
2196
+ // buffer type HBM
2172
2197
 
2173
- j0 = j1;
2174
- }
2175
- }
2198
+ #include <hbwmalloc.h>
2176
2199
 
2177
- // record the event of this copy
2178
- if (split->n_inputs > 0) {
2179
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2180
- lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
2181
- }
2182
- }
2200
+ static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2201
+ return "CPU_HBM";
2202
+
2203
+ LM_GGML_UNUSED(buft);
2204
+ }
2205
+
2206
+ static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
2207
+ hbw_free(buffer->context);
2208
+ }
2209
+
2210
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
2211
+ void * ptr;
2212
+ int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
2213
+ if (result != 0) {
2214
+ LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
2215
+ return NULL;
2183
2216
  }
2184
2217
 
2185
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
2218
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
2219
+ buffer->buft = buft;
2220
+ buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
2186
2221
 
2187
- return LM_GGML_STATUS_SUCCESS;
2222
+ return buffer;
2188
2223
  }
2189
2224
 
2190
- lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
2191
- lm_ggml_backend_t * backends,
2192
- lm_ggml_backend_buffer_type_t * bufts,
2193
- int n_backends,
2194
- size_t graph_size,
2195
- bool parallel) {
2196
- LM_GGML_ASSERT(n_backends > 0);
2197
- LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
2198
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
2225
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
2226
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
2227
+ /* .iface = */ {
2228
+ /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
2229
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
2230
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2231
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2232
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2233
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2234
+ },
2235
+ /* .context = */ NULL,
2236
+ };
2199
2237
 
2200
- struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
2238
+ return &lm_ggml_backend_cpu_buffer_type_hbm;
2239
+ }
2240
+ #endif
2201
2241
 
2202
- sched->debug = getenv("LM_GGML_SCHED_DEBUG") != NULL;
2203
- sched->n_backends = n_backends;
2204
- sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
2242
+ static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_get_extra_bufts(lm_ggml_backend_dev_t device) {
2243
+ static lm_ggml_backend_buffer_type_t bufts[] = {
2244
+ #ifdef LM_GGML_USE_CPU_HBM
2245
+ lm_ggml_backend_cpu_hbm_buffer_type(),
2246
+ #endif
2247
+ NULL
2248
+ };
2205
2249
 
2206
- // initialize hash table
2207
- // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
2208
- sched->hash_set = lm_ggml_hash_set_new(graph_size);
2209
- sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2210
- sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
2250
+ return bufts;
2211
2251
 
2212
- const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
2213
- const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
2214
- sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
2215
- sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
2216
- sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
2217
- sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
2252
+ LM_GGML_UNUSED(device);
2253
+ }
2218
2254
 
2219
- sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
2220
- sched->context_buffer = (char *) malloc(sched->context_buffer_size);
2255
+ // CPU backend - backend (stream)
2221
2256
 
2222
- const int initial_splits_capacity = 16;
2223
- sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
2224
- sched->splits_capacity = initial_splits_capacity;
2257
+ struct lm_ggml_backend_cpu_context {
2258
+ int n_threads;
2259
+ lm_ggml_threadpool_t threadpool;
2225
2260
 
2226
- for (int b = 0; b < n_backends; b++) {
2227
- sched->backends[b] = backends[b];
2228
- sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
2229
- LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
2230
- if (sched->n_copies > 1) {
2231
- for (int c = 0; c < sched->n_copies; c++) {
2232
- sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
2233
- }
2234
- }
2235
- }
2261
+ uint8_t * work_data;
2262
+ size_t work_size;
2236
2263
 
2237
- sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
2264
+ lm_ggml_abort_callback abort_callback;
2265
+ void * abort_callback_data;
2266
+ };
2238
2267
 
2239
- lm_ggml_backend_sched_reset(sched);
2268
+ static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
2269
+ return "CPU";
2240
2270
 
2241
- return sched;
2271
+ LM_GGML_UNUSED(backend);
2242
2272
  }
2243
2273
 
2244
- void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
2245
- if (sched == NULL) {
2246
- return;
2247
- }
2248
- for (int b = 0; b < sched->n_backends; b++) {
2249
- for (int c = 0; c < sched->n_copies; c++) {
2250
- lm_ggml_backend_event_free(sched->events[b][c]);
2251
- }
2252
- }
2253
- lm_ggml_gallocr_free(sched->galloc);
2254
- lm_ggml_free(sched->ctx);
2255
- lm_ggml_hash_set_free(&sched->hash_set);
2256
- free(sched->splits);
2257
- free(sched->hv_tensor_backend_ids);
2258
- free(sched->hv_tensor_copies);
2259
- free(sched->node_backend_ids);
2260
- free(sched->leaf_backend_ids);
2261
- free(sched->prev_node_backend_ids);
2262
- free(sched->prev_leaf_backend_ids);
2263
- free(sched->context_buffer);
2264
- free(sched->graph.nodes);
2265
- free(sched->graph.leafs);
2266
- free(sched);
2274
+ static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
2275
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2276
+ delete[] cpu_ctx->work_data;
2277
+ delete cpu_ctx;
2278
+ delete backend;
2267
2279
  }
2268
2280
 
2269
- void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
2270
- // reset state for the next run
2271
- if (!sched->is_reset) {
2272
- lm_ggml_hash_set_reset(&sched->hash_set);
2273
- memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2274
- memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
2275
- sched->is_reset = true;
2276
- }
2277
- sched->is_alloc = false;
2278
- }
2281
+ struct lm_ggml_backend_plan_cpu {
2282
+ struct lm_ggml_cplan cplan;
2283
+ struct lm_ggml_cgraph cgraph;
2284
+ };
2279
2285
 
2280
- bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
2281
- LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
2286
+ static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
2287
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2282
2288
 
2283
- lm_ggml_backend_sched_split_graph(sched, measure_graph);
2289
+ struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
2284
2290
 
2285
- if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
2286
- return false;
2291
+ cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
2292
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
2293
+
2294
+ if (cpu_plan->cplan.work_size > 0) {
2295
+ cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
2296
+ if (cpu_plan->cplan.work_data == NULL) {
2297
+ delete cpu_plan;
2298
+ return NULL;
2299
+ }
2287
2300
  }
2288
2301
 
2289
- lm_ggml_backend_sched_reset(sched);
2290
- lm_ggml_backend_sched_synchronize(sched);
2302
+ cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
2303
+ cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2291
2304
 
2292
- return true;
2305
+ return cpu_plan;
2293
2306
  }
2294
2307
 
2295
- bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
2296
- LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
2308
+ static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
2309
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2297
2310
 
2298
- lm_ggml_backend_sched_split_graph(sched, graph);
2311
+ delete[] cpu_plan->cplan.work_data;
2312
+ delete cpu_plan;
2299
2313
 
2314
+ LM_GGML_UNUSED(backend);
2315
+ }
2300
2316
 
2301
- if (!lm_ggml_backend_sched_alloc_splits(sched)) {
2302
- return false;
2303
- }
2317
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
2318
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2304
2319
 
2305
- sched->is_alloc = true;
2320
+ return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
2306
2321
 
2307
- return true;
2322
+ LM_GGML_UNUSED(backend);
2308
2323
  }
2309
2324
 
2310
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
2311
- enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
2312
- lm_ggml_backend_sched_synchronize(sched);
2313
- return err;
2314
- }
2325
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
2326
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2315
2327
 
2316
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
2317
- if (!sched->is_reset && !sched->is_alloc) {
2318
- lm_ggml_backend_sched_reset(sched);
2319
- }
2328
+ struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
2320
2329
 
2321
- if (!sched->is_alloc) {
2322
- if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
2330
+ if (cpu_ctx->work_size < cplan.work_size) {
2331
+ delete[] cpu_ctx->work_data;
2332
+ cpu_ctx->work_data = new uint8_t[cplan.work_size];
2333
+ if (cpu_ctx->work_data == NULL) {
2334
+ cpu_ctx->work_size = 0;
2323
2335
  return LM_GGML_STATUS_ALLOC_FAILED;
2324
2336
  }
2337
+ cpu_ctx->work_size = cplan.work_size;
2325
2338
  }
2339
+ cplan.work_data = (uint8_t *)cpu_ctx->work_data;
2326
2340
 
2327
- return lm_ggml_backend_sched_compute_splits(sched);
2328
- }
2341
+ cplan.abort_callback = cpu_ctx->abort_callback;
2342
+ cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2329
2343
 
2330
- void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
2331
- for (int i = 0; i < sched->n_backends; i++) {
2332
- lm_ggml_backend_synchronize(sched->backends[i]);
2333
- }
2344
+ return lm_ggml_graph_compute(cgraph, &cplan);
2334
2345
  }
2335
2346
 
2336
- void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
2337
- sched->callback_eval = callback;
2338
- sched->callback_eval_user_data = user_data;
2339
- }
2347
+ static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
2348
+ /* .get_name = */ lm_ggml_backend_cpu_get_name,
2349
+ /* .free = */ lm_ggml_backend_cpu_free,
2350
+ /* .set_tensor_async = */ NULL,
2351
+ /* .get_tensor_async = */ NULL,
2352
+ /* .cpy_tensor_async = */ NULL,
2353
+ /* .synchronize = */ NULL,
2354
+ /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
2355
+ /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
2356
+ /* .graph_plan_update = */ NULL,
2357
+ /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
2358
+ /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
2359
+ /* .event_record = */ NULL,
2360
+ /* .event_wait = */ NULL,
2361
+ };
2340
2362
 
2341
- int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
2342
- return sched->n_splits;
2363
+ static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
2364
+ static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
2365
+ return &guid;
2343
2366
  }
2344
2367
 
2345
- int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
2346
- return sched->n_copies;
2347
- }
2368
+ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
2369
+ // initialize CPU backend now to avoid slowing the first graph computation
2370
+ lm_ggml_cpu_init();
2348
2371
 
2349
- int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
2350
- return sched->n_backends;
2372
+ struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
2373
+ if (ctx == NULL) {
2374
+ return NULL;
2375
+ }
2376
+
2377
+ ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
2378
+ ctx->threadpool = NULL;
2379
+ ctx->work_data = NULL;
2380
+ ctx->work_size = 0;
2381
+ ctx->abort_callback = NULL;
2382
+ ctx->abort_callback_data = NULL;
2383
+
2384
+ lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
2385
+ /* .guid = */ lm_ggml_backend_cpu_guid(),
2386
+ /* .interface = */ lm_ggml_backend_cpu_i,
2387
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2388
+ /* .context = */ ctx,
2389
+ };
2390
+
2391
+ if (cpu_backend == NULL) {
2392
+ delete ctx;
2393
+ return NULL;
2394
+ }
2395
+
2396
+ return cpu_backend;
2351
2397
  }
2352
2398
 
2353
- lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
2354
- LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
2355
- return sched->backends[i];
2399
+ bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
2400
+ return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
2356
2401
  }
2357
2402
 
2358
- size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
2359
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2360
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2403
+ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
2404
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2361
2405
 
2362
- return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
2406
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2407
+ ctx->n_threads = n_threads;
2363
2408
  }
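
A minimal end-to-end sketch of the CPU backend entry points above (illustrative only): `graph` stands in for a previously built lm_ggml_cgraph, and lm_ggml_backend_free is assumed to follow the package's usual lm_-prefixed naming.

    lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();
    if (backend != NULL) {
        lm_ggml_backend_cpu_set_n_threads(backend, 4);   // any positive thread count
        enum lm_ggml_status st = lm_ggml_backend_graph_compute(backend, graph);
        if (st != LM_GGML_STATUS_SUCCESS) {
            // handle the failure (e.g. LM_GGML_STATUS_ALLOC_FAILED from the work buffer)
        }
        lm_ggml_backend_free(backend);
    }
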
2364
2409
 
2365
- void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
2366
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2367
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2368
- tensor_backend_id(node) = backend_index;
2369
- SET_CAUSE(node, "usr");
2370
- sched->is_reset = false;
2371
- }
2410
+ void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
2411
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2372
2412
 
2373
- lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
2374
- int backend_index = tensor_backend_id(node);
2375
- if (backend_index == -1) {
2376
- return NULL;
2413
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2414
+
2415
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
2416
+ // already had a different threadpool, pause/suspend it before switching
2417
+ lm_ggml_threadpool_pause(ctx->threadpool);
2377
2418
  }
2378
- return sched->backends[backend_index];
2419
+ ctx->threadpool = threadpool;
2379
2420
  }
2380
2421
 
2381
- // utils
2422
+ void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
2423
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2382
2424
 
2383
- void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
2384
- LM_GGML_ASSERT(tensor->buffer == NULL);
2385
- LM_GGML_ASSERT(tensor->view_src != NULL);
2386
- LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
2387
- LM_GGML_ASSERT(tensor->view_src->data != NULL);
2425
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2426
+ ctx->abort_callback = abort_callback;
2427
+ ctx->abort_callback_data = abort_callback_data;
2428
+ }
2388
2429
 
2389
- tensor->buffer = tensor->view_src->buffer;
2390
- tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
2391
- lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
2430
+ lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
2431
+ LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
2432
+ return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_from_ptr_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
2392
2433
  }
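
A sketch of wrapping caller-owned memory with the helper above (illustrative only): the pointer must satisfy the TENSOR_ALIGNMENT assert, which lm_ggml_aligned_malloc (the allocator used by the CPU buffer type above) provides; freeing the resulting buffer does not free the wrapped pointer, since the from_ptr interface has no free_buffer.

    size_t size = 16u * 1024 * 1024;
    void * mem  = lm_ggml_aligned_malloc(size);
    lm_ggml_backend_buffer_t buf = lm_ggml_backend_cpu_buffer_from_ptr(mem, size);
    // ... place tensors in `buf` and compute ...
    lm_ggml_backend_buffer_free(buf);   // releases the wrapper only
    lm_ggml_aligned_free(mem, size);    // the caller still owns the memory
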
2393
2434
 
2394
- void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
2395
- LM_GGML_ASSERT(tensor->buffer == NULL);
2396
- LM_GGML_ASSERT(tensor->data == NULL);
2397
- LM_GGML_ASSERT(tensor->view_src == NULL);
2398
- LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
2399
- LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
2400
- (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
2435
+ // CPU backend - device
2401
2436
 
2402
- tensor->buffer = buffer;
2403
- tensor->data = addr;
2404
- lm_ggml_backend_buffer_init_tensor(buffer, tensor);
2437
+ struct lm_ggml_backend_cpu_device_context {
2438
+ std::string description = "CPU";
2439
+
2440
+ lm_ggml_backend_cpu_device_context() {
2441
+ #ifdef __APPLE__
2442
+ size_t len = 0;
2443
+ if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
2444
+ description.resize(len);
2445
+ sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
2446
+ }
2447
+ #elif defined(__linux__)
2448
+ FILE * f = fopen("/proc/cpuinfo", "r");
2449
+ if (f) {
2450
+ char buf[1024];
2451
+ while (fgets(buf, sizeof(buf), f)) {
2452
+ if (strncmp(buf, "model name", 10) == 0) {
2453
+ char * p = strchr(buf, ':');
2454
+ if (p) {
2455
+ p++;
2456
+ while (std::isspace(*p)) {
2457
+ p++;
2458
+ }
2459
+ while (std::isspace(p[strlen(p) - 1])) {
2460
+ p[strlen(p) - 1] = '\0';
2461
+ }
2462
+ description = p;
2463
+ break;
2464
+ }
2465
+ }
2466
+ }
2467
+ fclose(f);
2468
+ }
2469
+ #elif defined(_WIN32)
2470
+ HKEY hKey;
2471
+ if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
2472
+ TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
2473
+ 0,
2474
+ KEY_READ,
2475
+ &hKey) == ERROR_SUCCESS) {
2476
+ DWORD cpu_brand_size = 0;
2477
+ if (RegQueryValueExA(hKey,
2478
+ TEXT("ProcessorNameString"),
2479
+ NULL,
2480
+ NULL,
2481
+ NULL,
2482
+ &cpu_brand_size) == ERROR_SUCCESS) {
2483
+ description.resize(cpu_brand_size);
2484
+ if (RegQueryValueExA(hKey,
2485
+ TEXT("ProcessorNameString"),
2486
+ NULL,
2487
+ NULL,
2488
+ (LPBYTE)&description[0], // NOLINT
2489
+ &cpu_brand_size) == ERROR_SUCCESS) {
2490
+ if (description.find('\0') != std::string::npos) {
2491
+ description.resize(description.find('\0'));
2492
+ }
2493
+ }
2494
+ }
2495
+ RegCloseKey(hKey);
2496
+ }
2497
+ #endif
2498
+ }
2499
+ };
2500
+
2501
+ static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
2502
+ return "CPU";
2503
+
2504
+ LM_GGML_UNUSED(dev);
2405
2505
  }
2406
2506
 
2407
- static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
2408
- struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
2507
+ static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
2508
+ struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
2409
2509
 
2410
- LM_GGML_ASSERT(src != NULL);
2411
- LM_GGML_ASSERT(src->data && "graph must be allocated");
2510
+ return ctx->description.c_str();
2511
+ }
2412
2512
 
2413
- size_t id = lm_ggml_hash_insert(&hash_set, src);
2414
- if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
2415
- return node_copies[lm_ggml_hash_find(&hash_set, src)];
2416
- }
2513
+ static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
2514
+ // TODO
2515
+ *free = 0;
2516
+ *total = 0;
2417
2517
 
2418
- struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
2419
- if (src->view_src != NULL) {
2420
- dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
2421
- dst->view_offs = src->view_offs;
2422
- }
2423
- dst->op = src->op;
2424
- memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
2425
- lm_ggml_set_name(dst, src->name);
2518
+ LM_GGML_UNUSED(dev);
2519
+ }
2426
2520
 
2427
- // copy src
2428
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
2429
- struct lm_ggml_tensor * s = src->src[i];
2430
- if (s == NULL) {
2431
- continue;
2432
- }
2433
- dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
2434
- }
2521
+ static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
2522
+ return LM_GGML_BACKEND_DEVICE_TYPE_CPU;
2435
2523
 
2436
- node_copies[id] = dst;
2437
- return dst;
2524
+ LM_GGML_UNUSED(dev);
2438
2525
  }
2439
2526
 
2440
- static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
2441
- size_t id = lm_ggml_hash_find(hash_set, src);
2442
- if (node_init[id]) {
2443
- return;
2444
- }
2445
- node_init[id] = true;
2527
+ static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
2528
+ props->name = lm_ggml_backend_cpu_device_get_name(dev);
2529
+ props->description = lm_ggml_backend_cpu_device_get_description(dev);
2530
+ props->type = lm_ggml_backend_cpu_device_get_type(dev);
2531
+ lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
2532
+ props->caps = {
2533
+ /* .async = */ false,
2534
+ /* .host_buffer = */ false,
2535
+ /* .buffer_from_host_ptr = */ true,
2536
+ /* .events = */ false,
2537
+ };
2538
+ }
2446
2539
 
2447
- struct lm_ggml_tensor * dst = node_copies[id];
2448
- if (dst->view_src != NULL) {
2449
- graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
2450
- lm_ggml_backend_view_init(dst);
2451
- }
2452
- else {
2453
- lm_ggml_backend_tensor_copy(src, dst);
2454
- }
2540
+ static lm_ggml_backend_t lm_ggml_backend_cpu_device_init_backend(lm_ggml_backend_dev_t dev, const char * params) {
2541
+ return lm_ggml_backend_cpu_init();
2455
2542
 
2456
- // init src
2457
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
2458
- struct lm_ggml_tensor * s = src->src[i];
2459
- if (s == NULL) {
2460
- continue;
2461
- }
2462
- graph_copy_init_tensor(hash_set, node_copies, node_init, s);
2463
- }
2543
+ LM_GGML_UNUSED(dev);
2544
+ LM_GGML_UNUSED(params);
2464
2545
  }
2465
2546
 
2466
- struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
2467
- struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
2468
- struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2469
- bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
2547
+ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
2548
+ return lm_ggml_backend_cpu_buffer_type();
2470
2549
 
2471
- struct lm_ggml_init_params params = {
2472
- /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
2473
- /* .mem_buffer = */ NULL,
2474
- /* .no_alloc = */ true
2475
- };
2550
+ LM_GGML_UNUSED(dev);
2551
+ }
2476
2552
 
2477
- struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
2478
- struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
2553
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_host_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
2554
+ return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
2479
2555
 
2480
- if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2481
- fprintf(stderr, "failed to allocate context for graph copy\n");
2482
- lm_ggml_hash_set_free(&hash_set);
2483
- free(node_copies);
2484
- free(node_init);
2485
- lm_ggml_free(ctx_allocated);
2486
- lm_ggml_free(ctx_unallocated);
2487
- return {
2488
- /* .buffer = */ NULL,
2489
- /* .ctx_allocated = */ NULL,
2490
- /* .ctx_unallocated = */ NULL,
2491
- /* .graph = */ NULL,
2492
- };
2493
- }
2556
+ LM_GGML_UNUSED(dev);
2557
+ LM_GGML_UNUSED(max_tensor_size);
2558
+ }
2494
2559
 
2495
- // dup nodes
2496
- for (int i = 0; i < graph->n_nodes; i++) {
2497
- struct lm_ggml_tensor * node = graph->nodes[i];
2498
- graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2560
+ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
2561
+ switch (op->op) {
2562
+ case LM_GGML_OP_CPY:
2563
+ return
2564
+ op->type != LM_GGML_TYPE_IQ2_XXS &&
2565
+ op->type != LM_GGML_TYPE_IQ2_XS &&
2566
+ op->type != LM_GGML_TYPE_IQ1_S &&
2567
+ op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
2568
+ case LM_GGML_OP_MUL_MAT:
2569
+ return op->src[1]->type == LM_GGML_TYPE_F32;// FIXME || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
2570
+ case LM_GGML_OP_ROPE_BACK:
2571
+ return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
2572
+ case LM_GGML_OP_IM2COL_BACK:
2573
+ return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
2574
+ case LM_GGML_OP_OUT_PROD:
2575
+ return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
2576
+ default:
2577
+ return true;
2499
2578
  }
2500
2579
 
2501
- // allocate nodes
2502
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2503
- if (buffer == NULL) {
2504
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
2505
- lm_ggml_hash_set_free(&hash_set);
2506
- free(node_copies);
2507
- free(node_init);
2508
- lm_ggml_free(ctx_allocated);
2509
- lm_ggml_free(ctx_unallocated);
2510
- return {
2511
- /* .buffer = */ NULL,
2512
- /* .ctx_allocated = */ NULL,
2513
- /* .ctx_unallocated = */ NULL,
2514
- /* .graph = */ NULL,
2515
- };
2516
- }
2580
+ LM_GGML_UNUSED(dev);
2581
+ }
2517
2582
 
2518
- //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2583
+ static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
2584
+ return lm_ggml_backend_buft_is_host(buft);
2519
2585
 
2520
- // copy data and init views
2521
- for (int i = 0; i < graph->n_nodes; i++) {
2522
- struct lm_ggml_tensor * node = graph->nodes[i];
2523
- graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
2524
- }
2586
+ LM_GGML_UNUSED(dev);
2587
+ }
2525
2588
 
2526
- // build graph copy
2527
- struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
2528
- for (int i = 0; i < graph->n_nodes; i++) {
2529
- struct lm_ggml_tensor * node = graph->nodes[i];
2530
- struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
2531
- graph_copy->nodes[i] = node_copy;
2532
- }
2533
- graph_copy->n_nodes = graph->n_nodes;
2589
+ static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
2590
+ /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
2591
+ /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
2592
+ /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
2593
+ /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
2594
+ /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
2595
+ /* .init_backend = */ lm_ggml_backend_cpu_device_init_backend,
2596
+ /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
2597
+ /* .get_host_buffer_type = */ NULL,
2598
+ /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_host_ptr,
2599
+ /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
2600
+ /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
2601
+ /* .offload_op = */ NULL,
2602
+ /* .event_new = */ NULL,
2603
+ /* .event_free = */ NULL,
2604
+ /* .event_synchronize = */ NULL,
2605
+ };
2534
2606
 
2535
- lm_ggml_hash_set_free(&hash_set);
2536
- free(node_copies);
2537
- free(node_init);
2607
+ // CPU backend - backend (reg)
2538
2608
 
2539
- return {
2540
- /* .buffer = */ buffer,
2541
- /* .ctx_allocated = */ ctx_allocated,
2542
- /* .ctx_unallocated = */ ctx_unallocated,
2543
- /* .graph = */ graph_copy,
2544
- };
2545
- }
2609
+ static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
2610
+ return "CPU";
2546
2611
 
2547
- void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
2548
- lm_ggml_backend_buffer_free(copy.buffer);
2549
- lm_ggml_free(copy.ctx_allocated);
2550
- lm_ggml_free(copy.ctx_unallocated);
2612
+ LM_GGML_UNUSED(reg);
2551
2613
  }
2552
2614
 
2553
- bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
2554
- struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
2555
- if (copy.buffer == NULL) {
2556
- return false;
2557
- }
2615
+ static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
2616
+ return 1;
2558
2617
 
2559
- struct lm_ggml_cgraph * g1 = graph;
2560
- struct lm_ggml_cgraph * g2 = copy.graph;
2618
+ LM_GGML_UNUSED(reg);
2619
+ }
2561
2620
 
2562
- assert(g1->n_nodes == g2->n_nodes);
2621
+ static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
2622
+ LM_GGML_ASSERT(index == 0);
2563
2623
 
2564
- for (int i = 0; i < g1->n_nodes; i++) {
2565
- //printf("eval %d/%d\n", i, g1->n_nodes);
2566
- struct lm_ggml_tensor * t1 = g1->nodes[i];
2567
- struct lm_ggml_tensor * t2 = g2->nodes[i];
2624
+ static lm_ggml_backend_cpu_device_context ctx;
2625
+ static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
2626
+ /* .iface = */ lm_ggml_backend_cpu_device_i,
2627
+ /* .reg = */ reg,
2628
+ /* .context = */ &ctx,
2629
+ };
2568
2630
 
2569
- assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
2631
+ return &lm_ggml_backend_cpu_device;
2632
+ }
2570
2633
 
2571
- struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
2572
- struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
2634
+ static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
2635
+ if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
2636
+ return (void *)lm_ggml_backend_cpu_set_n_threads;
2637
+ }
2638
+ if (strcmp(name, "lm_ggml_backend_dev_get_extra_bufts") == 0) {
2639
+ return (void *)lm_ggml_backend_cpu_get_extra_bufts;
2640
+ }
2573
2641
 
2574
- lm_ggml_backend_graph_compute(backend1, &g1v);
2575
- lm_ggml_backend_graph_compute(backend2, &g2v);
2642
+ return NULL;
2576
2643
 
2577
- if (lm_ggml_is_view_op(t1->op)) {
2578
- continue;
2579
- }
2644
+ LM_GGML_UNUSED(reg);
2645
+ }
2580
2646
 
2581
- // compare results, calculate rms etc
2582
- if (!callback(i, t1, t2, user_data)) {
2583
- break;
2584
- }
2585
- }
2647
+ static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
2648
+ /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
2649
+ /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
2650
+ /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
2651
+ /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
2652
+ };
2586
2653
 
2587
- lm_ggml_backend_graph_copy_free(copy);
2654
+ lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
2655
+ static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
2656
+ /* .iface = */ lm_ggml_backend_cpu_reg_i,
2657
+ /* .context = */ NULL,
2658
+ };
2588
2659
 
2589
- return true;
2660
+ return &lm_ggml_backend_cpu_reg;
2590
2661
  }
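
Finally, a sketch of how an out-of-tree caller might reach the CPU entry points through the registration object above (illustrative only); production code would normally go through public registry wrappers (assumed to exist under the package's lm_ prefix) rather than the iface used here.

    typedef void (*set_n_threads_t)(lm_ggml_backend_t, int);

    lm_ggml_backend_reg_t reg = lm_ggml_backend_cpu_reg();
    set_n_threads_t set_n_threads =
        (set_n_threads_t) reg->iface.get_proc_address(reg, "lm_ggml_backend_set_n_threads");

    lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();
    if (backend != NULL && set_n_threads != NULL) {
        set_n_threads(backend, 8);   // same effect as calling lm_ggml_backend_cpu_set_n_threads() directly
    }
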