whisper.rn 0.5.0-rc.9 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +265 -141
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +30 -13
  6. package/cpp/ggml-backend.cpp +221 -38
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-common.h +17 -0
  9. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  10. package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
  11. package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
  12. package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
  13. package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  14. package/cpp/ggml-cpu/arch-fallback.h +32 -2
  15. package/cpp/ggml-cpu/common.h +14 -0
  16. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  17. package/cpp/ggml-cpu/ggml-cpu.c +70 -42
  18. package/cpp/ggml-cpu/ggml-cpu.cpp +35 -28
  19. package/cpp/ggml-cpu/ops.cpp +1587 -1177
  20. package/cpp/ggml-cpu/ops.h +5 -8
  21. package/cpp/ggml-cpu/quants.c +35 -0
  22. package/cpp/ggml-cpu/quants.h +8 -0
  23. package/cpp/ggml-cpu/repack.cpp +458 -47
  24. package/cpp/ggml-cpu/repack.h +22 -0
  25. package/cpp/ggml-cpu/simd-mappings.h +89 -60
  26. package/cpp/ggml-cpu/traits.cpp +2 -2
  27. package/cpp/ggml-cpu/traits.h +1 -1
  28. package/cpp/ggml-cpu/vec.cpp +170 -26
  29. package/cpp/ggml-cpu/vec.h +506 -63
  30. package/cpp/ggml-cpu.h +1 -1
  31. package/cpp/ggml-impl.h +119 -9
  32. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  33. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  34. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  35. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  36. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  37. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  38. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  39. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  40. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  41. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  42. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  43. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  44. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  45. package/cpp/ggml-metal-impl.h +90 -51
  46. package/cpp/ggml-metal.h +1 -6
  47. package/cpp/ggml-opt.cpp +97 -41
  48. package/cpp/ggml-opt.h +25 -6
  49. package/cpp/ggml-quants.c +111 -16
  50. package/cpp/ggml-quants.h +6 -0
  51. package/cpp/ggml.c +486 -98
  52. package/cpp/ggml.h +221 -16
  53. package/cpp/gguf.cpp +8 -1
  54. package/cpp/jsi/RNWhisperJSI.cpp +25 -6
  55. package/cpp/jsi/ThreadPool.h +3 -3
  56. package/cpp/whisper.cpp +100 -76
  57. package/cpp/whisper.h +1 -0
  58. package/ios/CMakeLists.txt +6 -1
  59. package/ios/RNWhisper.mm +6 -6
  60. package/ios/RNWhisperContext.mm +2 -0
  61. package/ios/RNWhisperVadContext.mm +16 -13
  62. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  63. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  64. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  66. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  67. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  68. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  69. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  70. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  71. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
  72. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  73. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  74. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  77. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  78. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  79. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  80. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  81. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  82. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  83. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  84. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  85. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
  86. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  87. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  88. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  89. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  90. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  92. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  93. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  94. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  95. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  96. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  97. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  98. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  99. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  100. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
  101. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  102. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  103. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  104. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  105. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  106. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  107. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  108. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  109. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  110. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  111. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  112. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  113. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  114. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
  115. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  116. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  117. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  118. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  119. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  120. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +13 -0
  121. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  122. package/lib/commonjs/version.json +1 -1
  123. package/lib/module/realtime-transcription/RealtimeTranscriber.js +13 -0
  124. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  125. package/lib/module/version.json +1 -1
  126. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
  127. package/lib/typescript/realtime-transcription/types.d.ts +6 -0
  128. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
  129. package/package.json +1 -1
  130. package/src/realtime-transcription/RealtimeTranscriber.ts +17 -0
  131. package/src/realtime-transcription/types.ts +6 -0
  132. package/src/version.json +1 -1
  133. package/whisper-rn.podspec +8 -9
  134. package/cpp/ggml-metal.m +0 -6284
  135. package/cpp/ggml-whisper-sim.metallib +0 -0
  136. package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-cpu.h CHANGED
@@ -101,7 +101,6 @@ extern "C" {
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_riscv_v (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vsx (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vxe (void);
-     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_nnpa (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_wasm_simd (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_llamafile (void);

@@ -135,6 +134,7 @@ extern "C" {
      WSP_GGML_BACKEND_API wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void);

      WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
+     WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
      WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp16(const float *, wsp_ggml_fp16_t *, int64_t);
      WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t *, float *, int64_t);
      WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_bf16(const float *, wsp_ggml_bf16_t *, int64_t);
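The only API change in this header is the new `wsp_ggml_cpu_fp32_to_i32` conversion. A minimal sketch of how it could be called, assuming the whisper.rn ggml headers are on the include path (the exact rounding behavior lives in the CPU implementation, which is not shown in this diff):

```cpp
#include <cstdint>
#include <cstdio>

#include "ggml-cpu.h" // declares wsp_ggml_cpu_fp32_to_i32 as of this release

int main() {
    const float src[4] = { 0.0f, 1.5f, -2.0f, 3.9f };
    int32_t     dst[4] = { 0 };

    // (src, dst, element count), matching the declaration above
    wsp_ggml_cpu_fp32_to_i32(src, dst, 4);

    for (int i = 0; i < 4; i++) {
        std::printf("%g -> %d\n", (double) src[i], dst[i]);
    }
    return 0;
}
```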
package/cpp/ggml-impl.h CHANGED
@@ -73,6 +73,35 @@ static inline int wsp_ggml_up(int n, int m) {
      return (n + m - 1) & ~(m - 1);
  }

+ // TODO: move to ggml.h? (won't be able to inline)
+ static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b) {
+     if (a->type != b->type) {
+         return false;
+     }
+     for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
+         if (a->ne[i] != b->ne[i]) {
+             return false;
+         }
+         if (a->nb[i] != b->nb[i]) {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ static bool wsp_ggml_op_is_empty(enum wsp_ggml_op op) {
+     switch (op) {
+         case WSP_GGML_OP_NONE:
+         case WSP_GGML_OP_RESHAPE:
+         case WSP_GGML_OP_TRANSPOSE:
+         case WSP_GGML_OP_VIEW:
+         case WSP_GGML_OP_PERMUTE:
+             return true;
+         default:
+             return false;
+     }
+ }
+
  //
  // logging
  //
@@ -313,6 +342,10 @@ struct wsp_ggml_cgraph {
  // if you need the gradients, get them from the original graph
  struct wsp_ggml_cgraph wsp_ggml_graph_view(struct wsp_ggml_cgraph * cgraph, int i0, int i1);

+ // ggml-alloc.c: true if the operation can reuse memory from its sources
+ WSP_GGML_API bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op);
+
+
  // Memory allocation

  WSP_GGML_API void * wsp_ggml_aligned_malloc(size_t size);
@@ -394,6 +427,67 @@ static inline wsp_ggml_fp16_t wsp_ggml_compute_fp32_to_fp16(float f) {
  #define WSP_GGML_FP16_TO_FP32(x) WSP_GGML_COMPUTE_FP16_TO_FP32(x)
  #define WSP_GGML_FP32_TO_FP16(x) WSP_GGML_COMPUTE_FP32_TO_FP16(x)

+ static inline float wsp_ggml_e8m0_to_fp32(uint8_t x) {
+     uint32_t bits; // Stores the raw bit representation of the float
+
+     // Handle special case for minimum exponent (denormalized float)
+     if (x == 0) {
+         // Bit pattern for 2^(-127):
+         // - Sign bit: 0 (positive)
+         // - Exponent: 0 (denormalized number)
+         // - Mantissa: 0x400000 (0.5 in fractional form)
+         // Value = 0.5 * 2^(-126) = 2^(-127)
+         bits = 0x00400000;
+     }
+     // note: disabled as we don't need to handle NaNs
+     //// Handle special case for NaN (all bits set)
+     //else if (x == 0xFF) {
+     //    // Standard quiet NaN pattern:
+     //    // - Sign bit: 0
+     //    // - Exponent: all 1s (0xFF)
+     //    // - Mantissa: 0x400000 (quiet NaN flag)
+     //    bits = 0x7FC00000;
+     //}
+     // Normalized values (most common case)
+     else {
+         // Construct normalized float by shifting exponent into position:
+         // - Exponent field: 8 bits (positions 30-23)
+         // - Mantissa: 0 (implicit leading 1)
+         // Value = 2^(x - 127)
+         bits = (uint32_t) x << 23;
+     }
+
+     float result; // Final float value
+     // Safely reinterpret bit pattern as float without type-punning issues
+     memcpy(&result, &bits, sizeof(float));
+     return result;
+ }
+
+ // Equal to wsp_ggml_e8m0_to_fp32/2
+ // Useful with MXFP4 quantization since the E0M2 values are doubled
+ static inline float wsp_ggml_e8m0_to_fp32_half(uint8_t x) {
+     uint32_t bits;
+
+     // For x < 2: use precomputed denormal patterns
+     if (x < 2) {
+         // 0x00200000 = 2^(-128), 0x00400000 = 2^(-127)
+         bits = 0x00200000 << x;
+     }
+     // For x >= 2: normalized exponent adjustment
+     else {
+         // 0.5 * 2^(x-127) = 2^(x-128) = normalized with exponent (x-1)
+         bits = (uint32_t)(x - 1) << 23;
+     }
+     // Note: NaNs are not handled here
+
+     float result;
+     memcpy(&result, &bits, sizeof(float));
+     return result;
+ }
+
+ #define WSP_GGML_E8M0_TO_FP32(x) wsp_ggml_e8m0_to_fp32(x)
+ #define WSP_GGML_E8M0_TO_FP32_HALF(x) wsp_ggml_e8m0_to_fp32_half(x)
+
  /**
   * Converts brain16 to float32.
   *
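To see that the bit patterns above really decode to powers of two, here is a small standalone check of the E8M0 rule value = 2^(x - 127), with x == 0 mapped to the denormal 2^(-127). It mirrors the logic of `wsp_ggml_e8m0_to_fp32` without depending on the ggml headers:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <initializer_list>

// standalone copy of the decode logic from wsp_ggml_e8m0_to_fp32 above
static float e8m0_to_fp32(uint8_t x) {
    const uint32_t bits = (x == 0) ? 0x00400000u : ((uint32_t) x << 23);
    float result;
    std::memcpy(&result, &bits, sizeof(float));
    return result;
}

int main() {
    for (int x : { 0, 1, 126, 127, 128, 254 }) {
        // expected: 2^(x - 127), e.g. e8m0(127) == 1.0f
        std::printf("e8m0(%3d) = %g (2^%d = %g)\n",
                    x, e8m0_to_fp32((uint8_t) x), x - 127, std::ldexp(1.0, x - 127));
    }
    return 0;
}
```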
@@ -493,27 +587,27 @@ static inline bool wsp_ggml_node_has_n_uses(const struct wsp_ggml_cgrap
      return true;
  }

- // Returns true if nodes [i, i+ops.size()) are the sequence of wsp_ggml_ops in ops[]
+ // Returns true if nodes with indices { node_idxs } are the sequence of wsp_ggml_ops in ops[]
  // and are fusable. Nodes are considered fusable according to this function if:
  // - all nodes except the last have only one use and are not views/outputs (see wsp_ggml_node_has_N_uses).
  // - all nodes except the last are a src of the following node.
  // - all nodes are the same shape.
  // TODO: Consider allowing WSP_GGML_OP_NONE nodes in between
- static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int node_idx, const enum wsp_ggml_op * ops, int num_ops) {
-     if (node_idx + num_ops > cgraph->n_nodes) {
-         return false;
-     }
-
+ static inline bool wsp_ggml_can_fuse_ext(const struct wsp_ggml_cgraph * cgraph, const int * node_idxs, const enum wsp_ggml_op * ops, int num_ops) {
      for (int i = 0; i < num_ops; ++i) {
-         struct wsp_ggml_tensor * node = cgraph->nodes[node_idx + i];
+         if (node_idxs[i] >= cgraph->n_nodes) {
+             return false;
+         }
+
+         struct wsp_ggml_tensor * node = cgraph->nodes[node_idxs[i]];
          if (node->op != ops[i]) {
              return false;
          }
-         if (i < num_ops - 1 && !wsp_ggml_node_has_n_uses(cgraph, node_idx + i, 1)) {
+         if (i < num_ops - 1 && !wsp_ggml_node_has_n_uses(cgraph, node_idxs[i], 1)) {
              return false;
          }
          if (i > 0) {
-             struct wsp_ggml_tensor * prev = cgraph->nodes[node_idx + i - 1];
+             struct wsp_ggml_tensor * prev = cgraph->nodes[node_idxs[i - 1]];
              if (node->src[0] != prev && node->src[1] != prev) {
                  return false;
              }
@@ -525,6 +619,22 @@ static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int
      return true;
  }

+ // same as above, for sequential indices starting at node_idx
+ static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int node_idx, const enum wsp_ggml_op * ops, int num_ops) {
+     assert(num_ops < 32);
+
+     if (node_idx + num_ops > cgraph->n_nodes) {
+         return false;
+     }
+
+     int idxs[32];
+     for (int i = 0; i < num_ops; ++i) {
+         idxs[i] = node_idx + i;
+     }
+
+     return wsp_ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
+ }
+
  #ifdef __cplusplus
  }
  #endif
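As a hedged sketch of how a backend would use the refactored helper (ggml-impl.h is an internal header, and this wrapper is illustrative, not part of the diff): probing whether the three nodes starting at index `i` form a fusable NORM -> MUL -> ADD chain.

```cpp
#include "ggml-impl.h"

// illustrative helper, not from this release: check for a fusable
// NORM -> MUL -> ADD chain starting at node index i
static bool try_fuse_norm_mul_add(const struct wsp_ggml_cgraph * gf, int i) {
    const enum wsp_ggml_op ops[3] = { WSP_GGML_OP_NORM, WSP_GGML_OP_MUL, WSP_GGML_OP_ADD };

    // wsp_ggml_can_fuse() builds the sequential index list [i, i+1, i+2]
    // and defers to wsp_ggml_can_fuse_ext(), as shown above
    return wsp_ggml_can_fuse(gf, i, ops, 3);
}
```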
package/cpp/ggml-metal/ggml-metal-common.cpp ADDED
@@ -0,0 +1,446 @@
+ #include "ggml-metal-common.h"
+
+ #include "ggml-impl.h"
+ #include "ggml-backend-impl.h"
+
+ #include <vector>
+
+ // represents a memory range (i.e. an interval from a starting address p0 to an ending address p1 in a given buffer pb)
+ // the type indicates whether it is a source range (i.e. ops read data from it) or a destination range (i.e. ops write data to it)
+ struct wsp_ggml_mem_range {
+     uint64_t pb; // buffer id
+
+     uint64_t p0; // begin
+     uint64_t p1; // end
+
+     wsp_ggml_mem_range_type pt;
+ };
+
+ struct wsp_ggml_mem_ranges {
+     std::vector<wsp_ggml_mem_range> ranges;
+
+     int debug = 0;
+ };
+
+ wsp_ggml_mem_ranges_t wsp_ggml_mem_ranges_init(int debug) {
+     auto * res = new wsp_ggml_mem_ranges;
+
+     res->ranges.reserve(256);
+     res->debug = debug;
+
+     return res;
+ }
+
+ void wsp_ggml_mem_ranges_free(wsp_ggml_mem_ranges_t mrs) {
+     delete mrs;
+ }
+
+ void wsp_ggml_mem_ranges_reset(wsp_ggml_mem_ranges_t mrs) {
+     mrs->ranges.clear();
+ }
+
+ static bool wsp_ggml_mem_ranges_add(wsp_ggml_mem_ranges_t mrs, wsp_ggml_mem_range mr) {
+     mrs->ranges.push_back(mr);
+
+     return true;
+ }
+
+ static wsp_ggml_mem_range wsp_ggml_mem_range_from_tensor(const wsp_ggml_tensor * tensor, wsp_ggml_mem_range_type pt) {
+     // always use the base tensor
+     tensor = tensor->view_src ? tensor->view_src : tensor;
+
+     WSP_GGML_ASSERT(!tensor->view_src);
+
+     wsp_ggml_mem_range mr;
+
+     if (tensor->buffer) {
+         // when the tensor is allocated, use the actual memory address range in the buffer
+         //
+         // take the actual allocated size with wsp_ggml_backend_buft_get_alloc_size()
+         // this can be larger than the tensor size if the buffer type allocates extra memory
+         // ref: https://github.com/ggml-org/llama.cpp/pull/15966
+         mr = {
+             /*.pb =*/ (uint64_t) tensor->buffer,
+             /*.p0 =*/ (uint64_t) tensor->data,
+             /*.p1 =*/ (uint64_t) tensor->data + wsp_ggml_backend_buft_get_alloc_size(tensor->buffer->buft, tensor),
+             /*.pt =*/ pt,
+         };
+     } else {
+         // otherwise, the pointer address is used as an unique id of the memory ranges
+         // that the tensor will be using when it is allocated
+         mr = {
+             /*.pb =*/ (uint64_t) tensor,
+             /*.p0 =*/ 0,    //
+             /*.p1 =*/ 1024, // [0, 1024) is a dummy range, not used
+             /*.pt =*/ pt,
+         };
+     };
+
+     return mr;
+ }
+
+ static wsp_ggml_mem_range wsp_ggml_mem_range_from_tensor_src(const wsp_ggml_tensor * tensor) {
+     return wsp_ggml_mem_range_from_tensor(tensor, MEM_RANGE_TYPE_SRC);
+ }
+
+ static wsp_ggml_mem_range wsp_ggml_mem_range_from_tensor_dst(const wsp_ggml_tensor * tensor) {
+     return wsp_ggml_mem_range_from_tensor(tensor, MEM_RANGE_TYPE_DST);
+ }
+
+ static bool wsp_ggml_mem_ranges_add_src(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+     WSP_GGML_ASSERT(tensor);
+
+     wsp_ggml_mem_range mr = wsp_ggml_mem_range_from_tensor_src(tensor);
+
+     if (mrs->debug > 2) {
+         WSP_GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
+     }
+
+     return wsp_ggml_mem_ranges_add(mrs, mr);
+ }
+
+ static bool wsp_ggml_mem_ranges_add_dst(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+     WSP_GGML_ASSERT(tensor);
+
+     wsp_ggml_mem_range mr = wsp_ggml_mem_range_from_tensor_dst(tensor);
+
+     if (mrs->debug > 2) {
+         WSP_GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
+     }
+
+     return wsp_ggml_mem_ranges_add(mrs, mr);
+ }
+
+ bool wsp_ggml_mem_ranges_add(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+     for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
+         if (tensor->src[i]) {
+             wsp_ggml_mem_ranges_add_src(mrs, tensor->src[i]);
+         }
+     }
+
+     return wsp_ggml_mem_ranges_add_dst(mrs, tensor);
+ }
+
+ static bool wsp_ggml_mem_ranges_check(wsp_ggml_mem_ranges_t mrs, wsp_ggml_mem_range mr) {
+     for (size_t i = 0; i < mrs->ranges.size(); i++) {
+         const auto & cmp = mrs->ranges[i];
+
+         // two memory ranges cannot intersect if they are in different buffers
+         if (mr.pb != cmp.pb) {
+             continue;
+         }
+
+         // intersecting source ranges are allowed
+         if (mr.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
+             continue;
+         }
+
+         if (mr.p0 < cmp.p1 && mr.p1 >= cmp.p0) {
+             if (mrs->debug > 2) {
+                 WSP_GGML_LOG_DEBUG("%s: the %s range buf=%lld, [%lld, %lld) overlaps with a previous %s range buf=%lld, [%lld, %lld)\n",
+                         __func__,
+                         mr.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
+                         mr.pb, mr.p0, mr.p1,
+                         cmp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
+                         cmp.pb, cmp.p0, cmp.p1);
+             }
+
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ static bool wsp_ggml_mem_ranges_check_src(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+     WSP_GGML_ASSERT(tensor);
+
+     wsp_ggml_mem_range mr = wsp_ggml_mem_range_from_tensor_src(tensor);
+
+     const bool res = wsp_ggml_mem_ranges_check(mrs, mr);
+
+     return res;
+ }
+
+ static bool wsp_ggml_mem_ranges_check_dst(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+     WSP_GGML_ASSERT(tensor);
+
+     wsp_ggml_mem_range mr = wsp_ggml_mem_range_from_tensor_dst(tensor);
+
+     const bool res = wsp_ggml_mem_ranges_check(mrs, mr);
+
+     return res;
+ }
+
+ bool wsp_ggml_mem_ranges_check(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
+     for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
+         if (tensor->src[i]) {
+             if (!wsp_ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
+                 return false;
+             }
+         }
+     }
+
+     return wsp_ggml_mem_ranges_check_dst(mrs, tensor);
+ }
+
+ struct node_info {
+     wsp_ggml_tensor * node;
+
+     std::vector<wsp_ggml_tensor *> fused;
+
+     wsp_ggml_op op() const {
+         return node->op;
+     }
+
+     const wsp_ggml_tensor * dst() const {
+         return fused.empty() ? node : fused.back();
+     }
+
+     bool is_empty() const {
+         return wsp_ggml_op_is_empty(node->op);
+     }
+
+     void add_fused(wsp_ggml_tensor * t) {
+         fused.push_back(t);
+     }
+ };
+
+ static std::vector<int> wsp_ggml_metal_graph_optimize_reorder(const std::vector<node_info> & nodes) {
+     // helper to add node src and dst ranges
+     const auto & h_add = [](wsp_ggml_mem_ranges_t mrs, const node_info & node) {
+         for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+             if (node.node->src[i]) {
+                 if (!wsp_ggml_mem_ranges_add_src(mrs, node.node->src[i])) {
+                     return false;
+                 }
+             }
+         }
+
+         // keep track of the sources of the fused nodes as well
+         for (const auto * fused : node.fused) {
+             for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+                 if (fused->src[i]) {
+                     if (!wsp_ggml_mem_ranges_add_src(mrs, fused->src[i])) {
+                         return false;
+                     }
+                 }
+             }
+         }
+
+         return wsp_ggml_mem_ranges_add_dst(mrs, node.dst());
+     };
+
+     // helper to check if a node can run concurrently with the existing set of nodes
+     const auto & h_check = [](wsp_ggml_mem_ranges_t mrs, const node_info & node) {
+         for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+             if (node.node->src[i]) {
+                 if (!wsp_ggml_mem_ranges_check_src(mrs, node.node->src[i])) {
+                     return false;
+                 }
+             }
+         }
+
+         for (const auto * fused : node.fused) {
+             for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+                 if (fused->src[i]) {
+                     if (!wsp_ggml_mem_ranges_check_src(mrs, fused->src[i])) {
+                         return false;
+                     }
+                 }
+             }
+         }
+
+         return wsp_ggml_mem_ranges_check_dst(mrs, node.dst());
+     };
+
+     // perform reorders only across these types of ops
+     // can be expanded when needed
+     const auto & h_safe = [](wsp_ggml_op op) {
+         switch (op) {
+             case WSP_GGML_OP_MUL_MAT:
+             case WSP_GGML_OP_MUL_MAT_ID:
+             case WSP_GGML_OP_ROPE:
+             case WSP_GGML_OP_NORM:
+             case WSP_GGML_OP_RMS_NORM:
+             case WSP_GGML_OP_GROUP_NORM:
+             case WSP_GGML_OP_SUM_ROWS:
+             case WSP_GGML_OP_MUL:
+             case WSP_GGML_OP_ADD:
+             case WSP_GGML_OP_DIV:
+             case WSP_GGML_OP_GLU:
+             case WSP_GGML_OP_SCALE:
+             case WSP_GGML_OP_GET_ROWS:
+             case WSP_GGML_OP_CPY:
+             case WSP_GGML_OP_SET_ROWS:
+                 return true;
+             default:
+                 return wsp_ggml_op_is_empty(op);
+         }
+     };
+
+     const int n = nodes.size();
+
+     std::vector<int> res;
+     res.reserve(n);
+
+     std::vector<bool> used(n, false);
+
+     // the memory ranges for the set of currently concurrent nodes
+     wsp_ggml_mem_ranges_t mrs0 = wsp_ggml_mem_ranges_init(0);
+
+     // the memory ranges for the set of nodes that haven't been processed yet, when looking forward for a node to reorder
+     wsp_ggml_mem_ranges_t mrs1 = wsp_ggml_mem_ranges_init(0);
+
+     for (int i0 = 0; i0 < n; i0++) {
+         if (used[i0]) {
+             continue;
+         }
+
+         const auto & node0 = nodes[i0];
+
+         // the node is not concurrent with the existing concurrent set, so we have to "put a barrier" (i.e reset mrs0)
+         // but before we do that, look forward for some other nodes that can be added to the concurrent set mrs0
+         //
+         // note: we can always add empty nodes to the concurrent set as they don't read nor write anything
+         if (!node0.is_empty() && !h_check(mrs0, node0)) {
+             // this will hold the set of memory ranges from the nodes that haven't been processed yet
+             // if a node is not concurrent with this set, we cannot reorder it
+             wsp_ggml_mem_ranges_reset(mrs1);
+
+             // initialize it with the current node
+             h_add(mrs1, node0);
+
+             // that many nodes forward to search for a concurrent node
+             constexpr int N_FORWARD = 8;
+
+             for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
+                 if (used[i1]) {
+                     continue;
+                 }
+
+                 const auto & node1 = nodes[i1];
+
+                 // disallow reordering of certain ops
+                 if (!h_safe(node1.op())) {
+                     break;
+                 }
+
+                 const bool is_empty = node1.is_empty();
+
+                 // to reorder a node and add it to the concurrent set, it has to be:
+                 //   + empty or concurrent with all nodes in the existing concurrent set (mrs0)
+                 //   + concurrent with all nodes prior to it that haven't been processed yet (mrs1)
+                 if ((is_empty || h_check(mrs0, node1)) && h_check(mrs1, node1)) {
+                     // add the node to the existing concurrent set (i.e. reorder it for early execution)
+                     h_add(mrs0, node1);
+                     res.push_back(i1);
+
+                     // mark as used, so we skip re-processing it later
+                     used[i1] = true;
+                 } else {
+                     // expand the set of nodes that haven't been processed yet
+                     h_add(mrs1, node1);
+                 }
+             }
+
+             // finalize the concurrent set and begin a new one
+             wsp_ggml_mem_ranges_reset(mrs0);
+         }
+
+         // expand the concurrent set with the current node
+         {
+             h_add(mrs0, node0);
+             res.push_back(i0);
+         }
+     }
+
+     wsp_ggml_mem_ranges_free(mrs0);
+     wsp_ggml_mem_ranges_free(mrs1);
+
+     return res;
+ }
+
+ void wsp_ggml_graph_optimize(wsp_ggml_cgraph * gf) {
+     constexpr int MAX_FUSE = 16;
+
+     const int n = gf->n_nodes;
+
+     enum wsp_ggml_op ops[MAX_FUSE];
+
+     std::vector<node_info> nodes;
+     nodes.reserve(gf->n_nodes);
+
+     // fuse nodes:
+     // we don't want to make reorders that break fusing, so we first pack all fusable tensors
+     // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
+     for (int i = 0; i < n; i++) {
+         node_info node = {
+             /*.node  =*/ gf->nodes[i],
+             /*.fused =*/ {},
+         };
+
+         // fuse only ops that start with these operations
+         // can be expanded when needed
+         if (node.op() == WSP_GGML_OP_ADD ||
+             node.op() == WSP_GGML_OP_NORM ||
+             node.op() == WSP_GGML_OP_RMS_NORM) {
+             ops[0] = node.op();
+
+             int f = i + 1;
+             while (f < n && f < i + MAX_FUSE) {
+                 // conservatively allow fusing only these ops
+                 // can be expanded when needed
+                 if (gf->nodes[f]->op != WSP_GGML_OP_ADD &&
+                     gf->nodes[f]->op != WSP_GGML_OP_MUL &&
+                     gf->nodes[f]->op != WSP_GGML_OP_NORM &&
+                     gf->nodes[f]->op != WSP_GGML_OP_RMS_NORM) {
+                     break;
+                 }
+                 ops[f - i] = gf->nodes[f]->op;
+                 f++;
+             }
+
+             f -= i;
+             for (; f > 1; f--) {
+                 if (wsp_ggml_can_fuse(gf, i, ops, f)) {
+                     break;
+                 }
+             }
+
+             // add the fused tensors into the node info so we can unfuse them later
+             for (int k = 1; k < f; k++) {
+                 ++i;
+
+                 // the .dst() becomes the last fused tensor
+                 node.add_fused(gf->nodes[i]);
+             }
+         }
+
+         nodes.push_back(std::move(node));
+     }
+
+ #if 1
+     // reorder to improve concurrency
+     const auto order = wsp_ggml_metal_graph_optimize_reorder(nodes);
+ #else
+     std::vector<int> order(nodes.size());
+     for (size_t i = 0; i < nodes.size(); i++) {
+         order[i] = i;
+     }
+ #endif
+
+     // unfuse
+     {
+         int j = 0;
+         for (const auto i : order) {
+             const auto & node = nodes[i];
+
+             gf->nodes[j++] = node.node;
+
+             for (auto * fused : node.fused) {
+                 gf->nodes[j++] = fused;
+             }
+         }
+     }
+ }
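The conflict rule driving all of this fits in a few lines. A toy, self-contained model (names are illustrative, not the library API): two ranges conflict only if they live in the same buffer, at least one of them is a write, and their intervals intersect. Note that the real check above uses `mr.p1 >= cmp.p0`, which is slightly more conservative in that it also rejects ranges that merely touch end-to-start:

```cpp
#include <cstdint>
#include <cstdio>

enum RangeType { SRC, DST };

struct Range { uint64_t buf, p0, p1; RangeType type; }; // interval [p0, p1)

// toy version of the rule in wsp_ggml_mem_ranges_check above
static bool conflicts(const Range & a, const Range & b) {
    if (a.buf != b.buf)                 return false; // different buffers never alias
    if (a.type == SRC && b.type == SRC) return false; // concurrent reads are fine
    return a.p0 < b.p1 && b.p0 < a.p1;                // a write is involved: check overlap
}

int main() {
    const Range write     = { 0,  0,  64, DST }; // an op writing bytes [0, 64) of buffer 0
    const Range read_tail = { 0, 64, 128, SRC }; // reads [64, 128): disjoint, can run concurrently
    const Range read_head = { 0, 32,  96, SRC }; // reads [32, 96): overlaps the write
    std::printf("tail concurrent: %d\n", !conflicts(write, read_tail)); // 1
    std::printf("head concurrent: %d\n", !conflicts(write, read_head)); // 0
    return 0;
}
```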
package/cpp/ggml-metal/ggml-metal-common.h ADDED
@@ -0,0 +1,52 @@
+ // helper functions for ggml-metal that are too difficult to implement in Objective-C
+
+ #pragma once
+
+ #include <stdbool.h>
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ struct wsp_ggml_tensor;
+ struct wsp_ggml_cgraph;
+
+ enum wsp_ggml_mem_range_type {
+     MEM_RANGE_TYPE_SRC = 0,
+     MEM_RANGE_TYPE_DST = 1,
+ };
+
+ // a helper object that can be used for reordering operations to improve concurrency
+ //
+ // the fundamental idea is that a set of tasks (either ggml ops, or something else) can run concurrently if they
+ // don't write to a memory that is being read by another task or written to by another task in the set
+ //
+ // with this structure, we can add tasks to the set, setting memory constraints. we can also check if a new task
+ // can be added to the set without violating the constraints (i.e. if it can be executed concurrently with the
+ // tasks already in the set)
+ //
+ typedef struct wsp_ggml_mem_ranges * wsp_ggml_mem_ranges_t;
+
+ wsp_ggml_mem_ranges_t wsp_ggml_mem_ranges_init(int debug);
+ void wsp_ggml_mem_ranges_free(wsp_ggml_mem_ranges_t mrs);
+
+ // remove all ranges from the set
+ void wsp_ggml_mem_ranges_reset(wsp_ggml_mem_ranges_t mrs);
+
+ // add src or dst ranges to track
+ bool wsp_ggml_mem_ranges_add(wsp_ggml_mem_ranges_t mrs, const struct wsp_ggml_tensor * tensor);
+
+ // return false if:
+ // - new src range overlaps with any existing dst range
+ // - new dst range overlaps with any existing range (src or dst)
+ bool wsp_ggml_mem_ranges_check(wsp_ggml_mem_ranges_t mrs, const struct wsp_ggml_tensor * tensor);
+
+ // reorder the nodes in the graph to improve concurrency, while respecting fusion
+ //
+ // note: this implementation is generic and not specific to metal
+ // if it proves to work well, we can start using it for other backends in the future
+ void wsp_ggml_graph_optimize(struct wsp_ggml_cgraph * gf);
+
+ #ifdef __cplusplus
+ }
+ #endif
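A minimal usage sketch of this API, assuming `a` and `b` are nodes of an already-built graph; the wrapper function is illustrative, not part of the header:

```cpp
#include "ggml-metal-common.h"

// illustrative wrapper: could the op producing b be issued concurrently
// with the op producing a?
static bool can_run_concurrently(const struct wsp_ggml_tensor * a,
                                 const struct wsp_ggml_tensor * b) {
    wsp_ggml_mem_ranges_t mrs = wsp_ggml_mem_ranges_init(/*debug =*/ 0);

    // record the src/dst ranges touched by the first op
    wsp_ggml_mem_ranges_add(mrs, a);

    // then ask whether the second op's ranges conflict with them
    const bool ok = wsp_ggml_mem_ranges_check(mrs, b);

    wsp_ggml_mem_ranges_free(mrs);
    return ok;
}
```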