whisper.rn 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/android/src/main/jni.cpp +12 -3
  4. package/cpp/ggml-alloc.c +292 -130
  5. package/cpp/ggml-backend-impl.h +4 -4
  6. package/cpp/ggml-backend-reg.cpp +13 -5
  7. package/cpp/ggml-backend.cpp +207 -17
  8. package/cpp/ggml-backend.h +19 -1
  9. package/cpp/ggml-cpu/amx/amx.cpp +5 -2
  10. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  11. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  12. package/cpp/ggml-cpu/common.h +14 -0
  13. package/cpp/ggml-cpu/ggml-cpu-impl.h +14 -7
  14. package/cpp/ggml-cpu/ggml-cpu.c +65 -44
  15. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  16. package/cpp/ggml-cpu/ops.cpp +542 -775
  17. package/cpp/ggml-cpu/ops.h +2 -0
  18. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  19. package/cpp/ggml-cpu/unary-ops.cpp +135 -0
  20. package/cpp/ggml-cpu/unary-ops.h +5 -0
  21. package/cpp/ggml-cpu/vec.cpp +227 -20
  22. package/cpp/ggml-cpu/vec.h +407 -56
  23. package/cpp/ggml-cpu.h +1 -1
  24. package/cpp/ggml-impl.h +94 -12
  25. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  26. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  27. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  28. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  29. package/cpp/ggml-metal/ggml-metal-device.cpp +1565 -0
  30. package/cpp/ggml-metal/ggml-metal-device.h +244 -0
  31. package/cpp/ggml-metal/ggml-metal-device.m +1325 -0
  32. package/cpp/ggml-metal/ggml-metal-impl.h +802 -0
  33. package/cpp/ggml-metal/ggml-metal-ops.cpp +3583 -0
  34. package/cpp/ggml-metal/ggml-metal-ops.h +88 -0
  35. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  36. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  37. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  38. package/cpp/ggml-metal-impl.h +40 -40
  39. package/cpp/ggml-metal.h +1 -6
  40. package/cpp/ggml-quants.c +1 -0
  41. package/cpp/ggml.c +341 -15
  42. package/cpp/ggml.h +150 -5
  43. package/cpp/jsi/RNWhisperJSI.cpp +9 -2
  44. package/cpp/jsi/ThreadPool.h +3 -3
  45. package/cpp/rn-whisper.h +1 -0
  46. package/cpp/whisper.cpp +89 -72
  47. package/cpp/whisper.h +1 -0
  48. package/ios/CMakeLists.txt +6 -1
  49. package/ios/RNWhisperContext.mm +3 -1
  50. package/ios/RNWhisperVadContext.mm +14 -13
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  57. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
  58. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  59. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  60. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  61. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  62. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  69. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
  70. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  71. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  72. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  73. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  74. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  80. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  81. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  82. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
  83. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  84. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  85. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  86. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  87. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  92. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  93. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  94. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
  95. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  96. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  97. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  98. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  99. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  100. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  101. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  102. package/lib/commonjs/version.json +1 -1
  103. package/lib/module/NativeRNWhisper.js.map +1 -1
  104. package/lib/module/version.json +1 -1
  105. package/lib/typescript/NativeRNWhisper.d.ts +2 -0
  106. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  107. package/package.json +1 -1
  108. package/src/NativeRNWhisper.ts +2 -0
  109. package/src/version.json +1 -1
  110. package/whisper-rn.podspec +8 -9
  111. package/cpp/ggml-metal.m +0 -6779
  112. package/cpp/ggml-whisper-sim.metallib +0 -0
  113. package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-impl.h CHANGED
@@ -73,7 +73,7 @@ static inline int wsp_ggml_up(int n, int m) {
     return (n + m - 1) & ~(m - 1);
 }
 
-// TODO: move to ggml.h?
+// TODO: move to ggml.h? (won't be able to inline)
 static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b) {
     if (a->type != b->type) {
         return false;
@@ -89,6 +89,22 @@ static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const str
     return true;
 }
 
+static bool wsp_ggml_op_is_empty(enum wsp_ggml_op op) {
+    switch (op) {
+        case WSP_GGML_OP_NONE:
+        case WSP_GGML_OP_RESHAPE:
+        case WSP_GGML_OP_TRANSPOSE:
+        case WSP_GGML_OP_VIEW:
+        case WSP_GGML_OP_PERMUTE:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static inline float wsp_ggml_softplus(float input) {
+    return (input > 20.0f) ? input : logf(1 + expf(input));
+}
 //
 // logging
 //
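
Note on the wsp_ggml_softplus helper added above: expf(x) overflows float range near x ≈ 88, so the naive logf(1 + expf(x)) returns +inf for large inputs, while for any x > 20 the exact result differs from x by less than e^-20 ≈ 2e-9, below single precision. The early return is therefore both an overflow guard and exact at float precision. A minimal standalone sketch (not part of the package) showing the difference:

// build: g++ -O2 softplus_demo.cpp && ./a.out
#include <cmath>
#include <cstdio>

static float softplus_naive(float x) { return logf(1.0f + expf(x)); }
static float softplus_safe (float x) { return (x > 20.0f) ? x : logf(1.0f + expf(x)); }

int main() {
    const float xs[] = { 1.0f, 20.0f, 50.0f, 100.0f };
    for (float x : xs) {
        // at x = 100, expf overflows to +inf and the naive version prints inf;
        // the guarded version just returns x
        printf("x = %6.1f  naive = %12.4f  safe = %12.4f\n", x, softplus_naive(x), softplus_safe(x));
    }
    return 0;
}
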
@@ -329,6 +345,10 @@ struct wsp_ggml_cgraph {
 // if you need the gradients, get them from the original graph
 struct wsp_ggml_cgraph wsp_ggml_graph_view(struct wsp_ggml_cgraph * cgraph, int i0, int i1);
 
+// ggml-alloc.c: true if the operation can reuse memory from its sources
+WSP_GGML_API bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op);
+
+
 // Memory allocation
 
 WSP_GGML_API void * wsp_ggml_aligned_malloc(size_t size);
@@ -545,14 +565,23 @@ static inline wsp_ggml_bf16_t wsp_ggml_compute_fp32_to_bf16(float s) {
 #define WSP_GGML_FP32_TO_BF16(x) wsp_ggml_compute_fp32_to_bf16(x)
 #define WSP_GGML_BF16_TO_FP32(x) wsp_ggml_compute_bf16_to_fp32(x)
 
+static inline int32_t wsp_ggml_node_get_use_count(const struct wsp_ggml_cgraph * cgraph, int node_idx) {
+    const struct wsp_ggml_tensor * node = cgraph->nodes[node_idx];
+
+    size_t hash_pos = wsp_ggml_hash_find(&cgraph->visited_hash_set, node);
+    if (!wsp_ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos)) {
+        return 0;
+    }
+    return cgraph->use_counts[hash_pos];
+}
+
 // return true if the node's results are only used by N other nodes
 // and can be fused into their calculations.
 static inline bool wsp_ggml_node_has_n_uses(const struct wsp_ggml_cgraph * cgraph, int node_idx, int32_t n_uses) {
     const struct wsp_ggml_tensor * node = cgraph->nodes[node_idx];
 
     // check the use count against how many we're replacing
-    size_t hash_pos = wsp_ggml_hash_find(&cgraph->visited_hash_set, node);
-    if (!wsp_ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos) || cgraph->use_counts[hash_pos] != n_uses) {
+    if (wsp_ggml_node_get_use_count(cgraph, node_idx) != n_uses) {
         return false;
     }
 
@@ -570,27 +599,27 @@ static inline bool wsp_ggml_node_has_n_uses(const struct wsp_ggml_cgrap
     return true;
 }
 
-// Returns true if nodes [i, i+ops.size()) are the sequence of wsp_ggml_ops in ops[]
+// Returns true if nodes with indices { node_idxs } are the sequence of wsp_ggml_ops in ops[]
 // and are fusable. Nodes are considered fusable according to this function if:
 // - all nodes except the last have only one use and are not views/outputs (see wsp_ggml_node_has_N_uses).
 // - all nodes except the last are a src of the following node.
 // - all nodes are the same shape.
 // TODO: Consider allowing WSP_GGML_OP_NONE nodes in between
-static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int node_idx, const enum wsp_ggml_op * ops, int num_ops) {
-    if (node_idx + num_ops > cgraph->n_nodes) {
-        return false;
-    }
-
+static inline bool wsp_ggml_can_fuse_ext(const struct wsp_ggml_cgraph * cgraph, const int * node_idxs, const enum wsp_ggml_op * ops, int num_ops) {
     for (int i = 0; i < num_ops; ++i) {
-        struct wsp_ggml_tensor * node = cgraph->nodes[node_idx + i];
+        if (node_idxs[i] >= cgraph->n_nodes) {
+            return false;
+        }
+
+        struct wsp_ggml_tensor * node = cgraph->nodes[node_idxs[i]];
         if (node->op != ops[i]) {
             return false;
         }
-        if (i < num_ops - 1 && !wsp_ggml_node_has_n_uses(cgraph, node_idx + i, 1)) {
+        if (i < num_ops - 1 && !wsp_ggml_node_has_n_uses(cgraph, node_idxs[i], 1)) {
             return false;
         }
         if (i > 0) {
-            struct wsp_ggml_tensor * prev = cgraph->nodes[node_idx + i - 1];
+            struct wsp_ggml_tensor * prev = cgraph->nodes[node_idxs[i - 1]];
             if (node->src[0] != prev && node->src[1] != prev) {
                 return false;
             }
@@ -602,6 +631,52 @@ static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int
     return true;
 }
 
+// same as above, for sequential indices starting at node_idx
+static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int node_idx, const enum wsp_ggml_op * ops, int num_ops) {
+    assert(num_ops < 32);
+
+    if (node_idx + num_ops > cgraph->n_nodes) {
+        return false;
+    }
+
+    int idxs[32];
+    for (int i = 0; i < num_ops; ++i) {
+        idxs[i] = node_idx + i;
+    }
+
+    return wsp_ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
+}
+
+WSP_GGML_API bool wsp_ggml_can_fuse_subgraph_ext(const struct wsp_ggml_cgraph * cgraph,
+                                                 const int * node_idxs,
+                                                 int count,
+                                                 const enum wsp_ggml_op * ops,
+                                                 const int * outputs,
+                                                 int num_outputs);
+
+// Returns true if the subgraph formed by {node_idxs} can be fused
+// checks whether all nodes which are not part of outputs can be elided
+// by checking if their num_uses are confined to the subgraph
+static inline bool wsp_ggml_can_fuse_subgraph(const struct wsp_ggml_cgraph * cgraph,
+                                              int node_idx,
+                                              int count,
+                                              const enum wsp_ggml_op * ops,
+                                              const int * outputs,
+                                              int num_outputs) {
+    WSP_GGML_ASSERT(count < 32);
+    if (node_idx + count > cgraph->n_nodes) {
+        return false;
+    }
+
+    int idxs[32];
+
+    for (int i = 0; i < count; ++i) {
+        idxs[i] = node_idx + i;
+    }
+
+    return wsp_ggml_can_fuse_subgraph_ext(cgraph, idxs, count, ops, outputs, num_outputs);
+}
+
 #ifdef __cplusplus
 }
 #endif
@@ -615,6 +690,13 @@ inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int node_id
     return wsp_ggml_can_fuse(cgraph, node_idx, ops.begin(), (int)ops.size());
 }
 
+inline bool wsp_ggml_can_fuse_subgraph(const struct wsp_ggml_cgraph * cgraph,
+                                       int start_idx,
+                                       std::initializer_list<enum wsp_ggml_op> ops,
+                                       std::initializer_list<int> outputs = {}) {
+    return wsp_ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
+}
+
 // expose GGUF internals for test code
 WSP_GGML_API size_t wsp_gguf_type_size(enum wsp_gguf_type type);
 WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_file_impl(FILE * file, struct wsp_gguf_init_params params);
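
The fusion predicates added above reduce to three checks: the node ops match the requested pattern, every node except the last has exactly one use, and each node after the first consumes the previous node's result. A standalone toy sketch of the same logic (hypothetical ToyNode type and toy_can_fuse function, not the package's API; the same-shape and view/output checks are omitted):

#include <cstdio>
#include <vector>

enum ToyOp { TOY_NORM, TOY_MUL, TOY_ADD };

struct ToyNode {
    ToyOp op;
    int   src[2];  // indices of source nodes, -1 if unused
    int   n_uses;  // how many later nodes read this node's result
};

static bool toy_can_fuse(const std::vector<ToyNode> & nodes, int idx, const ToyOp * ops, int num_ops) {
    if (idx + num_ops > (int) nodes.size()) return false;
    for (int i = 0; i < num_ops; ++i) {
        const ToyNode & node = nodes[idx + i];
        if (node.op != ops[i]) return false;
        // every node except the last must feed only the next node in the chain
        if (i < num_ops - 1 && node.n_uses != 1) return false;
        // every node after the first must read the previous node's result
        if (i > 0 && node.src[0] != idx + i - 1 && node.src[1] != idx + i - 1) return false;
    }
    return true;
}

int main() {
    // norm -> mul(norm, w) -> add(mul, b): the shape of a NORM + MUL + ADD pattern
    std::vector<ToyNode> g = {
        { TOY_NORM, { -1, -1 }, 1 },
        { TOY_MUL,  {  0, -1 }, 1 },
        { TOY_ADD,  {  1, -1 }, 1 },
    };
    const ToyOp pattern[] = { TOY_NORM, TOY_MUL, TOY_ADD };
    printf("fusable: %s\n", toy_can_fuse(g, 0, pattern, 3) ? "yes" : "no"); // fusable: yes
    return 0;
}
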
package/cpp/ggml-metal/ggml-metal-common.cpp ADDED
@@ -0,0 +1,446 @@
+#include "ggml-metal-common.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+
+#include <vector>
+
+// represents a memory range (i.e. an interval from a starting address p0 to an ending address p1 in a given buffer pb)
+// the type indicates whether it is a source range (i.e. ops read data from it) or a destination range (i.e. ops write data to it)
+struct wsp_ggml_mem_range {
+    uint64_t pb; // buffer id
+
+    uint64_t p0; // begin
+    uint64_t p1; // end
+
+    wsp_ggml_mem_range_type pt;
+};
+
+struct wsp_ggml_mem_ranges {
+    std::vector<wsp_ggml_mem_range> ranges;
+
+    int debug = 0;
+};
+
+wsp_ggml_mem_ranges_t wsp_ggml_mem_ranges_init(int debug) {
+    auto * res = new wsp_ggml_mem_ranges;
+
+    res->ranges.reserve(256);
+    res->debug = debug;
+
+    return res;
+}
+
+void wsp_ggml_mem_ranges_free(wsp_ggml_mem_ranges_t mrs) {
+    delete mrs;
+}
+
+void wsp_ggml_mem_ranges_reset(wsp_ggml_mem_ranges_t mrs) {
+    mrs->ranges.clear();
+}
+
+static bool wsp_ggml_mem_ranges_add(wsp_ggml_mem_ranges_t mrs, wsp_ggml_mem_range mr) {
+    mrs->ranges.push_back(mr);
+
+    return true;
+}
+
+static wsp_ggml_mem_range wsp_ggml_mem_range_from_tensor(const wsp_ggml_tensor * tensor, wsp_ggml_mem_range_type pt) {
+    // always use the base tensor
+    tensor = tensor->view_src ? tensor->view_src : tensor;
+
+    WSP_GGML_ASSERT(!tensor->view_src);
+
+    wsp_ggml_mem_range mr;
+
+    if (tensor->buffer) {
+        // when the tensor is allocated, use the actual memory address range in the buffer
+        //
+        // take the actual allocated size with wsp_ggml_backend_buft_get_alloc_size()
+        // this can be larger than the tensor size if the buffer type allocates extra memory
+        // ref: https://github.com/ggml-org/llama.cpp/pull/15966
+        mr = {
+            /*.pb =*/ (uint64_t) tensor->buffer,
+            /*.p0 =*/ (uint64_t) tensor->data,
+            /*.p1 =*/ (uint64_t) tensor->data + wsp_ggml_backend_buft_get_alloc_size(tensor->buffer->buft, tensor),
+            /*.pt =*/ pt,
+        };
+    } else {
+        // otherwise, the pointer address is used as a unique id of the memory ranges
+        // that the tensor will be using when it is allocated
+        mr = {
+            /*.pb =*/ (uint64_t) tensor,
+            /*.p0 =*/ 0,    //
+            /*.p1 =*/ 1024, // [0, 1024) is a dummy range, not used
+            /*.pt =*/ pt,
+        };
+    };
+
+    return mr;
+}
+
+static wsp_ggml_mem_range wsp_ggml_mem_range_from_tensor_src(const wsp_ggml_tensor * tensor) {
+    return wsp_ggml_mem_range_from_tensor(tensor, MEM_RANGE_TYPE_SRC);
+}
+
+static wsp_ggml_mem_range wsp_ggml_mem_range_from_tensor_dst(const wsp_ggml_tensor * tensor) {
+    return wsp_ggml_mem_range_from_tensor(tensor, MEM_RANGE_TYPE_DST);
+}
+
+static bool wsp_ggml_mem_ranges_add_src(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+    WSP_GGML_ASSERT(tensor);
+
+    wsp_ggml_mem_range mr = wsp_ggml_mem_range_from_tensor_src(tensor);
+
+    if (mrs->debug > 2) {
+        WSP_GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
+    }
+
+    return wsp_ggml_mem_ranges_add(mrs, mr);
+}
+
+static bool wsp_ggml_mem_ranges_add_dst(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+    WSP_GGML_ASSERT(tensor);
+
+    wsp_ggml_mem_range mr = wsp_ggml_mem_range_from_tensor_dst(tensor);
+
+    if (mrs->debug > 2) {
+        WSP_GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
+    }
+
+    return wsp_ggml_mem_ranges_add(mrs, mr);
+}
+
+bool wsp_ggml_mem_ranges_add(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+    for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+        if (tensor->src[i]) {
+            wsp_ggml_mem_ranges_add_src(mrs, tensor->src[i]);
+        }
+    }
+
+    return wsp_ggml_mem_ranges_add_dst(mrs, tensor);
+}
+
+static bool wsp_ggml_mem_ranges_check(wsp_ggml_mem_ranges_t mrs, wsp_ggml_mem_range mr) {
+    for (size_t i = 0; i < mrs->ranges.size(); i++) {
+        const auto & cmp = mrs->ranges[i];
+
+        // two memory ranges cannot intersect if they are in different buffers
+        if (mr.pb != cmp.pb) {
+            continue;
+        }
+
+        // intersecting source ranges are allowed
+        if (mr.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
+            continue;
+        }
+
+        if (mr.p0 < cmp.p1 && mr.p1 >= cmp.p0) {
+            if (mrs->debug > 2) {
+                WSP_GGML_LOG_DEBUG("%s: the %s range buf=%lld, [%lld, %lld) overlaps with a previous %s range buf=%lld, [%lld, %lld)\n",
+                        __func__,
+                        mr.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
+                        mr.pb, mr.p0, mr.p1,
+                        cmp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
+                        cmp.pb, cmp.p0, cmp.p1);
+            }
+
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static bool wsp_ggml_mem_ranges_check_src(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+    WSP_GGML_ASSERT(tensor);
+
+    wsp_ggml_mem_range mr = wsp_ggml_mem_range_from_tensor_src(tensor);
+
+    const bool res = wsp_ggml_mem_ranges_check(mrs, mr);
+
+    return res;
+}
+
+static bool wsp_ggml_mem_ranges_check_dst(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+    WSP_GGML_ASSERT(tensor);
+
+    wsp_ggml_mem_range mr = wsp_ggml_mem_range_from_tensor_dst(tensor);
+
+    const bool res = wsp_ggml_mem_ranges_check(mrs, mr);
+
+    return res;
+}
+
+bool wsp_ggml_mem_ranges_check(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+    for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+        if (tensor->src[i]) {
+            if (!wsp_ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
+                return false;
+            }
+        }
+    }
+
+    return wsp_ggml_mem_ranges_check_dst(mrs, tensor);
+}
+
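
The check above is the classic data-hazard rule: ranges in different buffers never conflict, two reads may overlap freely, and any overlap involving a write (read-after-write, write-after-read, write-after-write) forbids concurrency. A standalone sketch of the same rule on a toy Range type (hypothetical, not the package's wsp_ggml_mem_range), reusing the overlap test mr.p0 < cmp.p1 && mr.p1 >= cmp.p0 from above:

#include <cstdint>
#include <cstdio>
#include <vector>

enum RangeType { SRC, DST };

struct Range {
    uint64_t  buf;    // buffer id: ranges in different buffers never conflict
    uint64_t  p0, p1; // [p0, p1)
    RangeType type;
};

static bool compatible(const std::vector<Range> & set, const Range & r) {
    for (const Range & cmp : set) {
        if (r.buf != cmp.buf) continue;                     // different buffers
        if (r.type == SRC && cmp.type == SRC) continue;     // concurrent reads are fine
        if (r.p0 < cmp.p1 && r.p1 >= cmp.p0) return false;  // RAW/WAR/WAW hazard
    }
    return true;
}

int main() {
    std::vector<Range> set = { { 1, 0, 64, DST } }; // op A writes buf 1, [0, 64)
    printf("read  [0, 64):   %s\n", compatible(set, { 1,  0,  64, SRC }) ? "ok" : "hazard"); // hazard
    printf("read  [64, 128): %s\n", compatible(set, { 1, 64, 128, SRC }) ? "ok" : "hazard"); // ok
    printf("write [64, 128): %s\n", compatible(set, { 1, 64, 128, DST }) ? "ok" : "hazard"); // ok
    return 0;
}
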
+struct node_info {
+    wsp_ggml_tensor * node;
+
+    std::vector<wsp_ggml_tensor *> fused;
+
+    wsp_ggml_op op() const {
+        return node->op;
+    }
+
+    const wsp_ggml_tensor * dst() const {
+        return fused.empty() ? node : fused.back();
+    }
+
+    bool is_empty() const {
+        return wsp_ggml_op_is_empty(node->op);
+    }
+
+    void add_fused(wsp_ggml_tensor * t) {
+        fused.push_back(t);
+    }
+};
+
+static std::vector<int> wsp_ggml_metal_graph_optimize_reorder(const std::vector<node_info> & nodes) {
+    // helper to add node src and dst ranges
+    const auto & h_add = [](wsp_ggml_mem_ranges_t mrs, const node_info & node) {
+        for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+            if (node.node->src[i]) {
+                if (!wsp_ggml_mem_ranges_add_src(mrs, node.node->src[i])) {
+                    return false;
+                }
+            }
+        }
+
+        // keep track of the sources of the fused nodes as well
+        for (const auto * fused : node.fused) {
+            for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+                if (fused->src[i]) {
+                    if (!wsp_ggml_mem_ranges_add_src(mrs, fused->src[i])) {
+                        return false;
+                    }
+                }
+            }
+        }
+
+        return wsp_ggml_mem_ranges_add_dst(mrs, node.dst());
+    };
+
+    // helper to check if a node can run concurrently with the existing set of nodes
+    const auto & h_check = [](wsp_ggml_mem_ranges_t mrs, const node_info & node) {
+        for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+            if (node.node->src[i]) {
+                if (!wsp_ggml_mem_ranges_check_src(mrs, node.node->src[i])) {
+                    return false;
+                }
+            }
+        }
+
+        for (const auto * fused : node.fused) {
+            for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+                if (fused->src[i]) {
+                    if (!wsp_ggml_mem_ranges_check_src(mrs, fused->src[i])) {
+                        return false;
+                    }
+                }
+            }
+        }
+
+        return wsp_ggml_mem_ranges_check_dst(mrs, node.dst());
+    };
+
+    // perform reorders only across these types of ops
+    // can be expanded when needed
+    const auto & h_safe = [](wsp_ggml_op op) {
+        switch (op) {
+            case WSP_GGML_OP_MUL_MAT:
+            case WSP_GGML_OP_MUL_MAT_ID:
+            case WSP_GGML_OP_ROPE:
+            case WSP_GGML_OP_NORM:
+            case WSP_GGML_OP_RMS_NORM:
+            case WSP_GGML_OP_GROUP_NORM:
+            case WSP_GGML_OP_SUM_ROWS:
+            case WSP_GGML_OP_MUL:
+            case WSP_GGML_OP_ADD:
+            case WSP_GGML_OP_DIV:
+            case WSP_GGML_OP_GLU:
+            case WSP_GGML_OP_SCALE:
+            case WSP_GGML_OP_GET_ROWS:
+            case WSP_GGML_OP_CPY:
+            case WSP_GGML_OP_SET_ROWS:
+                return true;
+            default:
+                return wsp_ggml_op_is_empty(op);
+        }
+    };
+
+    const int n = nodes.size();
+
+    std::vector<int> res;
+    res.reserve(n);
+
+    std::vector<bool> used(n, false);
+
+    // the memory ranges for the set of currently concurrent nodes
+    wsp_ggml_mem_ranges_t mrs0 = wsp_ggml_mem_ranges_init(0);
+
+    // the memory ranges for the set of nodes that haven't been processed yet, when looking forward for a node to reorder
+    wsp_ggml_mem_ranges_t mrs1 = wsp_ggml_mem_ranges_init(0);
+
+    for (int i0 = 0; i0 < n; i0++) {
+        if (used[i0]) {
+            continue;
+        }
+
+        const auto & node0 = nodes[i0];
+
+        // the node is not concurrent with the existing concurrent set, so we have to "put a barrier" (i.e. reset mrs0)
+        // but before we do that, look forward for some other nodes that can be added to the concurrent set mrs0
+        //
+        // note: we can always add empty nodes to the concurrent set as they don't read nor write anything
+        if (!node0.is_empty() && !h_check(mrs0, node0)) {
+            // this will hold the set of memory ranges from the nodes that haven't been processed yet
+            // if a node is not concurrent with this set, we cannot reorder it
+            wsp_ggml_mem_ranges_reset(mrs1);
+
+            // initialize it with the current node
+            h_add(mrs1, node0);
+
+            // how many nodes forward to search for a concurrent node
+            constexpr int N_FORWARD = 8;
+
+            for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
+                if (used[i1]) {
+                    continue;
+                }
+
+                const auto & node1 = nodes[i1];
+
+                // disallow reordering of certain ops
+                if (!h_safe(node1.op())) {
+                    break;
+                }
+
+                const bool is_empty = node1.is_empty();
+
+                // to reorder a node and add it to the concurrent set, it has to be:
+                //   + empty or concurrent with all nodes in the existing concurrent set (mrs0)
+                //   + concurrent with all nodes prior to it that haven't been processed yet (mrs1)
+                if ((is_empty || h_check(mrs0, node1)) && h_check(mrs1, node1)) {
+                    // add the node to the existing concurrent set (i.e. reorder it for early execution)
+                    h_add(mrs0, node1);
+                    res.push_back(i1);
+
+                    // mark as used, so we skip re-processing it later
+                    used[i1] = true;
+                } else {
+                    // expand the set of nodes that haven't been processed yet
+                    h_add(mrs1, node1);
+                }
+            }
+
+            // finalize the concurrent set and begin a new one
+            wsp_ggml_mem_ranges_reset(mrs0);
+        }
+
+        // expand the concurrent set with the current node
+        {
+            h_add(mrs0, node0);
+            res.push_back(i0);
+        }
+    }
+
+    wsp_ggml_mem_ranges_free(mrs0);
+    wsp_ggml_mem_ranges_free(mrs1);
+
+    return res;
+}
+
+void wsp_ggml_graph_optimize(wsp_ggml_cgraph * gf) {
+    constexpr int MAX_FUSE = 16;
+
+    const int n = gf->n_nodes;
+
+    enum wsp_ggml_op ops[MAX_FUSE];
+
+    std::vector<node_info> nodes;
+    nodes.reserve(gf->n_nodes);
+
+    // fuse nodes:
+    // we don't want to make reorders that break fusing, so we first pack all fusable tensors
+    // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
+    for (int i = 0; i < n; i++) {
+        node_info node = {
+            /*.node  =*/ gf->nodes[i],
+            /*.fused =*/ {},
+        };
+
+        // fuse only ops that start with these operations
+        // can be expanded when needed
+        if (node.op() == WSP_GGML_OP_ADD ||
+            node.op() == WSP_GGML_OP_NORM ||
+            node.op() == WSP_GGML_OP_RMS_NORM) {
+            ops[0] = node.op();
+
+            int f = i + 1;
+            while (f < n && f < i + MAX_FUSE) {
+                // conservatively allow fusing only these ops
+                // can be expanded when needed
+                if (gf->nodes[f]->op != WSP_GGML_OP_ADD &&
+                    gf->nodes[f]->op != WSP_GGML_OP_MUL &&
+                    gf->nodes[f]->op != WSP_GGML_OP_NORM &&
+                    gf->nodes[f]->op != WSP_GGML_OP_RMS_NORM) {
+                    break;
+                }
+                ops[f - i] = gf->nodes[f]->op;
+                f++;
+            }
+
+            f -= i;
+            for (; f > 1; f--) {
+                if (wsp_ggml_can_fuse(gf, i, ops, f)) {
+                    break;
+                }
+            }
+
+            // add the fused tensors into the node info so we can unfuse them later
+            for (int k = 1; k < f; k++) {
+                ++i;
+
+                // the .dst() becomes the last fused tensor
+                node.add_fused(gf->nodes[i]);
+            }
+        }
+
+        nodes.push_back(std::move(node));
+    }
+
+#if 1
+    // reorder to improve concurrency
+    const auto order = wsp_ggml_metal_graph_optimize_reorder(nodes);
+#else
+    std::vector<int> order(nodes.size());
+    for (size_t i = 0; i < nodes.size(); i++) {
+        order[i] = i;
+    }
+#endif
+
+    // unfuse
+    {
+        int j = 0;
+        for (const auto i : order) {
+            const auto & node = nodes[i];
+
+            gf->nodes[j++] = node.node;
+
+            for (auto * fused : node.fused) {
+                gf->nodes[j++] = fused;
+            }
+        }
+    }
+}
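
wsp_ggml_metal_graph_optimize_reorder walks the graph in order; when the next node conflicts with the current concurrent set it scans up to N_FORWARD nodes ahead for one that conflicts neither with that set nor with any node it would jump over, pulls it forward, and only then places the barrier. A standalone sketch of that lookahead over toy tasks with explicit read/write cells (hypothetical Task type; the emptiness and h_safe op checks of the real code are omitted):

#include <cstdio>
#include <vector>

struct Task {
    std::vector<int> reads; // cells this task reads
    int              write; // cell this task writes
};

static bool conflicts(const std::vector<Task> & set, const Task & t) {
    for (const Task & s : set) {
        if (s.write == t.write) return true;                 // WAW
        for (int r : t.reads) if (r == s.write) return true; // RAW
        for (int r : s.reads) if (r == t.write) return true; // WAR
    }
    return false;
}

static std::vector<int> reorder(const std::vector<Task> & tasks) {
    const int n = (int) tasks.size();
    constexpr int N_FORWARD = 8;

    std::vector<int>  res;
    std::vector<bool> used(n, false);
    std::vector<Task> set; // current concurrent set (mrs0 in the code above)

    for (int i0 = 0; i0 < n; i0++) {
        if (used[i0]) continue;

        if (conflicts(set, tasks[i0])) {
            // pending plays the role of mrs1: the tasks a candidate would jump over
            std::vector<Task> pending = { tasks[i0] };
            for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
                if (used[i1]) continue;
                if (!conflicts(set, tasks[i1]) && !conflicts(pending, tasks[i1])) {
                    set.push_back(tasks[i1]); // pull forward into the concurrent set
                    res.push_back(i1);
                    used[i1] = true;
                } else {
                    pending.push_back(tasks[i1]);
                }
            }
            set.clear(); // barrier: start a new concurrent set
        }

        set.push_back(tasks[i0]);
        res.push_back(i0);
    }
    return res;
}

int main() {
    // task 1 reads what task 0 writes; task 2 is independent and gets pulled forward
    std::vector<Task> tasks = { { { 0 }, 10 }, { { 10 }, 11 }, { { 1 }, 12 } };
    for (int i : reorder(tasks)) printf("%d ", i); // prints: 0 2 1
    printf("\n");
    return 0;
}
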
package/cpp/ggml-metal/ggml-metal-common.h ADDED
@@ -0,0 +1,52 @@
+// helper functions for ggml-metal that are too difficult to implement in Objective-C
+
+#pragma once
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct wsp_ggml_tensor;
+struct wsp_ggml_cgraph;
+
+enum wsp_ggml_mem_range_type {
+    MEM_RANGE_TYPE_SRC = 0,
+    MEM_RANGE_TYPE_DST = 1,
+};
+
+// a helper object that can be used for reordering operations to improve concurrency
+//
+// the fundamental idea is that a set of tasks (either ggml ops, or something else) can run concurrently if they
+// don't write to memory that is being read by another task or written to by another task in the set
+//
+// with this structure, we can add tasks to the set, setting memory constraints. we can also check if a new task
+// can be added to the set without violating the constraints (i.e. if it can be executed concurrently with the
+// tasks already in the set)
+//
+typedef struct wsp_ggml_mem_ranges * wsp_ggml_mem_ranges_t;
+
+wsp_ggml_mem_ranges_t wsp_ggml_mem_ranges_init(int debug);
+void                  wsp_ggml_mem_ranges_free(wsp_ggml_mem_ranges_t mrs);
+
+// remove all ranges from the set
+void wsp_ggml_mem_ranges_reset(wsp_ggml_mem_ranges_t mrs);
+
+// add src or dst ranges to track
+bool wsp_ggml_mem_ranges_add(wsp_ggml_mem_ranges_t mrs, const struct wsp_ggml_tensor * tensor);
+
+// return false if:
+//   - new src range overlaps with any existing dst range
+//   - new dst range overlaps with any existing range (src or dst)
+bool wsp_ggml_mem_ranges_check(wsp_ggml_mem_ranges_t mrs, const struct wsp_ggml_tensor * tensor);
+
+// reorder the nodes in the graph to improve concurrency, while respecting fusion
+//
+// note: this implementation is generic and not specific to metal
+// if it proves to work well, we can start using it for other backends in the future
+void wsp_ggml_graph_optimize(struct wsp_ggml_cgraph * gf);
+
+#ifdef __cplusplus
+}
+#endif
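
A hedged usage sketch of this header's API. It assumes compiling against the package's cpp sources, and it assumes that wsp_ggml_init/wsp_ggml_free, wsp_ggml_new_tensor_1d, wsp_ggml_add, wsp_ggml_mul, wsp_ggml_scale, and WSP_GGML_TYPE_F32 are the package's wsp_-prefixed equivalents of the standard ggml API (those calls do not appear in this diff):

#include "ggml.h"
#include "ggml-metal-common.h"

#include <stdio.h>

int main(void) {
    // no_alloc = true: tensors get dummy per-tensor ranges instead of real buffer addresses
    struct wsp_ggml_init_params params = { 16*1024*1024, NULL, true };
    struct wsp_ggml_context * ctx = wsp_ggml_init(params);

    struct wsp_ggml_tensor * a = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 16);
    struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 16);

    struct wsp_ggml_tensor * c = wsp_ggml_add(ctx, a, b);      // writes c, reads a and b
    struct wsp_ggml_tensor * d = wsp_ggml_mul(ctx, a, b);      // only reads what c's op reads
    struct wsp_ggml_tensor * e = wsp_ggml_scale(ctx, c, 2.0f); // reads c: hazard with c's write

    wsp_ggml_mem_ranges_t mrs = wsp_ggml_mem_ranges_init(0);
    wsp_ggml_mem_ranges_add(mrs, c); // track the add's src/dst ranges

    printf("d concurrent with c? %s\n", wsp_ggml_mem_ranges_check(mrs, d) ? "yes" : "no"); // yes
    printf("e concurrent with c? %s\n", wsp_ggml_mem_ranges_check(mrs, e) ? "yes" : "no"); // no

    wsp_ggml_mem_ranges_free(mrs);
    wsp_ggml_free(ctx);
    return 0;
}
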
package/cpp/ggml-metal/ggml-metal-context.h ADDED
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "ggml-metal-device.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// backend context
+//
+
+typedef struct wsp_ggml_metal * wsp_ggml_metal_t;
+
+wsp_ggml_metal_t wsp_ggml_metal_init(wsp_ggml_metal_device_t dev);
+void             wsp_ggml_metal_free(wsp_ggml_metal_t ctx);
+
+void wsp_ggml_metal_synchronize(wsp_ggml_metal_t ctx);
+
+void wsp_ggml_metal_set_tensor_async(wsp_ggml_metal_t ctx, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+void wsp_ggml_metal_get_tensor_async(wsp_ggml_metal_t ctx, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
+
+enum wsp_ggml_status wsp_ggml_metal_graph_compute (wsp_ggml_metal_t ctx, struct wsp_ggml_cgraph * gf);
+void                 wsp_ggml_metal_graph_optimize(wsp_ggml_metal_t ctx, struct wsp_ggml_cgraph * gf);
+
+void wsp_ggml_metal_set_n_cb            (wsp_ggml_metal_t ctx, int n_cb);
+void wsp_ggml_metal_set_abort_callback  (wsp_ggml_metal_t ctx, wsp_ggml_abort_callback abort_callback, void * user_data);
+bool wsp_ggml_metal_supports_family     (wsp_ggml_metal_t ctx, int family);
+void wsp_ggml_metal_capture_next_compute(wsp_ggml_metal_t ctx);
+
+#ifdef __cplusplus
+}
+#endif