npm - whisper.rn - Versions diffs - 0.5.0-rc.9 → 0.5.0 - Mend

whisper.rn 0.5.0-rc.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81)

package/cpp/ggml-alloc.c CHANGED Viewed

@@ -22,21 +22,6 @@ static bool wsp_ggml_is_view(const struct wsp_ggml_tensor * t) {
     return t->view_src != NULL;
 }
-static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b) {
-    if (a->type != b->type) {
-        return false;
-    }
-    for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
-        if (a->ne[i] != b->ne[i]) {
-            return false;
-        }
-        if (a->nb[i] != b->nb[i]) {
-            return false;
-        }
-    }
-    return true;
-}
 // ops that return true for this function must not use restrict pointers for their backend implementations
 static bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op) {
     switch (op) {
@@ -44,6 +29,7 @@ static bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op) {
         case WSP_GGML_OP_DIAG_MASK_ZERO:
         case WSP_GGML_OP_DIAG_MASK_INF:
         case WSP_GGML_OP_ADD:
+        case WSP_GGML_OP_ADD_ID:
         case WSP_GGML_OP_ADD1:
         case WSP_GGML_OP_SUB:
         case WSP_GGML_OP_MUL:

package/cpp/ggml-backend-reg.cpp CHANGED Viewed

@@ -45,6 +45,14 @@
 #include "ggml-vulkan.h"
 #endif
+#ifdef WSP_GGML_USE_WEBGPU
+#include "ggml-webgpu.h"
+#endif
+#ifdef WSP_GGML_USE_ZDNN
+#include "ggml-zdnn.h"
+#endif
 #ifdef WSP_GGML_USE_OPENCL
 #include "ggml-opencl.h"
 #endif
@@ -61,10 +69,6 @@
 #include "ggml-cann.h"
 #endif
-#ifdef WSP_GGML_USE_KOMPUTE
-#include "ggml-kompute.h"
-#endif
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -177,6 +181,12 @@ struct wsp_ggml_backend_registry {
 #ifdef WSP_GGML_USE_VULKAN
         register_backend(wsp_ggml_backend_vk_reg());
 #endif
+#ifdef WSP_GGML_USE_WEBGPU
+        register_backend(wsp_ggml_backend_webgpu_reg());
+#endif
+#ifdef WSP_GGML_USE_ZDNN
+        register_backend(wsp_ggml_backend_zdnn_reg());
+#endif
 #ifdef WSP_GGML_USE_OPENCL
         register_backend(wsp_ggml_backend_opencl_reg());
 #endif
@@ -189,9 +199,6 @@ struct wsp_ggml_backend_registry {
 #ifdef WSP_GGML_USE_RPC
         register_backend(wsp_ggml_backend_rpc_reg());
 #endif
-#ifdef WSP_GGML_USE_KOMPUTE
-        register_backend(wsp_ggml_backend_kompute_reg());
-#endif
 #ifdef WSP_GGML_USE_CPU
         register_backend(wsp_ggml_backend_cpu_reg());
 #endif
@@ -498,6 +505,9 @@ static wsp_ggml_backend_reg_t wsp_ggml_backend_load_best(const char * name, bool
     std::vector<fs::path> search_paths;
     if (user_search_path == nullptr) {
+#ifdef WSP_GGML_BACKEND_DIR
+        search_paths.push_back(fs::u8path(WSP_GGML_BACKEND_DIR));
+#endif
         // default search paths: executable directory, current directory
         search_paths.push_back(get_executable_path());
         search_paths.push_back(fs::current_path());
@@ -575,7 +585,6 @@ void wsp_ggml_backend_load_all_from_path(const char * dir_path) {
     wsp_ggml_backend_load_best("cann", silent, dir_path);
     wsp_ggml_backend_load_best("cuda", silent, dir_path);
     wsp_ggml_backend_load_best("hip", silent, dir_path);
-    wsp_ggml_backend_load_best("kompute", silent, dir_path);
     wsp_ggml_backend_load_best("metal", silent, dir_path);
     wsp_ggml_backend_load_best("rpc", silent, dir_path);
     wsp_ggml_backend_load_best("sycl", silent, dir_path);

package/cpp/ggml-backend.cpp CHANGED Viewed

@@ -352,21 +352,6 @@ wsp_ggml_backend_dev_t wsp_ggml_backend_get_device(wsp_ggml_backend_t backend) {
 // backend copy
-static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b) {
-    if (a->type != b->type) {
-        return false;
-    }
-    for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
-        if (a->ne[i] != b->ne[i]) {
-            return false;
-        }
-        if (a->nb[i] != b->nb[i]) {
-            return false;
-        }
-    }
-    return true;
-}
 void wsp_ggml_backend_tensor_copy(struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
     WSP_GGML_ASSERT(wsp_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
@@ -662,6 +647,7 @@ struct wsp_ggml_backend_sched {
     // pipeline parallelism support
     int n_copies;
     int cur_copy;
+    int next_copy;
     wsp_ggml_backend_event_t events[WSP_GGML_SCHED_MAX_BACKENDS][WSP_GGML_SCHED_MAX_COPIES];
     struct wsp_ggml_tensor * graph_inputs[WSP_GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_graph_inputs;
@@ -1085,6 +1071,11 @@ static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, s
                 }
             }
         }
+        // if the node is still unassigned, assign it to the first backend that supports it
+        for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
+            wsp_ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
+        }
+        WSP_GGML_ASSERT(*cur_backend_id != -1);
     }
     // pass 5: split graph, find tensors that need to be copied
@@ -1112,7 +1103,7 @@ static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, s
             const int node_backend_id = tensor_backend_id(node);
-            assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
+            WSP_GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
             // check if we should start a new split based on the sources of the current node
             bool need_new_split = false;
@@ -1170,7 +1161,7 @@ static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, s
                 size_t src_id = hash_id(src);
                 const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
-                assert(src_backend_id != -1); // all inputs should be assigned by now
+                WSP_GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
                 if (src->flags & WSP_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                     if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
@@ -1448,8 +1439,6 @@ static enum wsp_ggml_status wsp_ggml_backend_sched_compute_splits(wsp_ggml_backe
         }
     }
-    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
     return WSP_GGML_STATUS_SUCCESS;
 }
@@ -1550,10 +1539,10 @@ void wsp_ggml_backend_sched_reset(wsp_ggml_backend_sched_t sched) {
 bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph) {
     WSP_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
-    wsp_ggml_backend_sched_split_graph(sched, measure_graph);
     wsp_ggml_backend_sched_synchronize(sched);
+    wsp_ggml_backend_sched_split_graph(sched, measure_graph);
     if (!wsp_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
@@ -1565,6 +1554,10 @@ bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_g
 bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
     WSP_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
+    WSP_GGML_ASSERT(!sched->is_alloc);
+    sched->cur_copy = sched->next_copy;
+    sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
     wsp_ggml_backend_sched_split_graph(sched, graph);
@@ -1605,7 +1598,7 @@ void wsp_ggml_backend_sched_synchronize(wsp_ggml_backend_sched_t sched) {
         // if the graph is not already allocated, always use copy 0 after a synchronization
         // this ensures that during generation the same copy is used every time,
         // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
-        sched->cur_copy = 0;
+        sched->next_copy = 0;
     }
 }

package/cpp/ggml-common.h CHANGED Viewed

@@ -99,6 +99,9 @@ typedef sycl::half2 wsp_ggml_half2;
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 #define QR4_1 2
+#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
+#define QR_MXFP4 2
 #define QI5_0 (QK5_0 / (4 * QR5_0))
 #define QR5_0 2
@@ -184,6 +187,13 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(wsp_ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
+#define QK_MXFP4 32
+typedef struct {
+    uint8_t e; // E8M0
+    uint8_t qs[QK_MXFP4/2];
+} block_mxfp4;
+static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
 #define QK5_0 32
 typedef struct {
     wsp_ggml_half d;           // delta
@@ -1074,10 +1084,17 @@ WSP_GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 WSP_GGML_TABLE_END()
+// TODO: fix name to kvalues_iq4_nl
 WSP_GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
     -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
 WSP_GGML_TABLE_END()
+// e2m1 values (doubled)
+// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+WSP_GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
+    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
+WSP_GGML_TABLE_END()
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f