whisper.rn 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/android/src/main/jni.cpp +12 -3
  2. package/cpp/ggml-alloc.c +49 -18
  3. package/cpp/ggml-backend-impl.h +0 -3
  4. package/cpp/ggml-backend-reg.cpp +8 -0
  5. package/cpp/ggml-backend.cpp +0 -2
  6. package/cpp/ggml-backend.h +2 -0
  7. package/cpp/ggml-cpu/amx/amx.cpp +1 -0
  8. package/cpp/ggml-cpu/arch/arm/quants.c +428 -26
  9. package/cpp/ggml-cpu/ggml-cpu-impl.h +4 -2
  10. package/cpp/ggml-cpu/ggml-cpu.c +67 -24
  11. package/cpp/ggml-cpu/ops.cpp +489 -364
  12. package/cpp/ggml-cpu/ops.h +4 -4
  13. package/cpp/ggml-cpu/repack.cpp +143 -29
  14. package/cpp/ggml-cpu/simd-mappings.h +25 -25
  15. package/cpp/ggml-cpu/unary-ops.cpp +151 -0
  16. package/cpp/ggml-cpu/unary-ops.h +7 -0
  17. package/cpp/ggml-cpu/vec.cpp +83 -0
  18. package/cpp/ggml-cpu/vec.h +20 -8
  19. package/cpp/ggml-impl.h +67 -2
  20. package/cpp/ggml-metal/ggml-metal-common.cpp +2 -2
  21. package/cpp/ggml-metal/ggml-metal-context.m +5 -6
  22. package/cpp/ggml-metal/ggml-metal-device.cpp +300 -14
  23. package/cpp/ggml-metal/ggml-metal-device.h +26 -1
  24. package/cpp/ggml-metal/ggml-metal-device.m +243 -28
  25. package/cpp/ggml-metal/ggml-metal-impl.h +177 -9
  26. package/cpp/ggml-metal/ggml-metal-ops.cpp +843 -157
  27. package/cpp/ggml-metal/ggml-metal-ops.h +8 -0
  28. package/cpp/ggml-metal/ggml-metal.cpp +8 -3
  29. package/cpp/ggml-metal/ggml-metal.metal +12436 -0
  30. package/cpp/ggml.c +317 -4
  31. package/cpp/ggml.h +139 -0
  32. package/cpp/jsi/RNWhisperJSI.cpp +7 -2
  33. package/cpp/rn-whisper.h +1 -0
  34. package/cpp/whisper.cpp +8 -2
  35. package/ios/RNWhisperContext.mm +3 -1
  36. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  37. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  38. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +67 -2
  39. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +139 -0
  40. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  41. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  42. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-metal.metal +12436 -0
  43. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  44. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  45. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  46. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +67 -2
  47. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +139 -0
  48. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  49. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  50. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  51. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +12436 -0
  52. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  53. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  54. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  55. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +67 -2
  56. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +139 -0
  57. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  58. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  59. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-metal.metal +12436 -0
  60. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  61. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  62. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  63. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +67 -2
  64. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +139 -0
  65. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  66. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  67. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  68. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +12436 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  70. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  71. package/lib/commonjs/version.json +1 -1
  72. package/lib/module/NativeRNWhisper.js.map +1 -1
  73. package/lib/module/version.json +1 -1
  74. package/lib/typescript/NativeRNWhisper.d.ts +2 -0
  75. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  76. package/package.json +1 -1
  77. package/src/NativeRNWhisper.ts +2 -0
  78. package/src/version.json +1 -1
  79. package/whisper-rn.podspec +1 -1
  80. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  81. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  82. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  83. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  84. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  85. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
package/cpp/ggml-cpu/vec.h CHANGED
@@ -44,6 +44,7 @@ void wsp_ggml_vec_dot_bf16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_gg
 void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggml_fp16_t * WSP_GGML_RESTRICT x, size_t bx, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y, size_t by, int nrc);
 
 void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x);
+wsp_ggml_float wsp_ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
 wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
 wsp_ggml_float wsp_ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
 
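The new wsp_ggml_vec_cvar_f32 helper is only declared here; judging by its name and the trailing comment, it centers y around the supplied mean and returns the variance. A minimal scalar sketch of that contract, assuming those semantics (a hypothetical reference, not the SIMD implementation shipped in vec.cpp):

```cpp
// Hypothetical scalar reference for wsp_ggml_vec_cvar_f32, inferred from its
// declaration comment ("it will also center y (y = y - mean)").
static double vec_cvar_f32_ref(const int n, float * y, const float * x, const float mean) {
    double sum = 0.0;
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] - mean;           // center: y = x - mean
        sum += (double) y[i] * y[i];  // accumulate squared deviations
    }
    return n > 0 ? sum / n : 0.0;     // variance around the supplied mean
}
```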
@@ -143,14 +144,14 @@ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float
     for (int i = 0; i < np; i += wsp_ggml_f16_step) {
         ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0); // 8 elements
 
-        ax1 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 0*wsp_ggml_f16_epr, 0); // 8 elemnst
+        ax1 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
         sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
         ax1 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
         sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
 
         ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1); // next 8 elements
 
-        ax2 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 1*wsp_ggml_f16_epr, 1); // next 8 ekements
+        ax2 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 1*wsp_ggml_f16_epr, 1); // next 8 elements
         sum_01 = WSP_GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
         ax2 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 1*wsp_ggml_f16_epr, 1);
         sum_11 = WSP_GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -159,7 +160,7 @@ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float
 
         ax3 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 2*wsp_ggml_f16_epr, 2);
         sum_02 = WSP_GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-        ax1 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 2*wsp_ggml_f16_epr, 2);
+        ax3 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 2*wsp_ggml_f16_epr, 2);
         sum_12 = WSP_GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
 
         ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
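The change in this hunk is a correctness fix, not a comment cleanup: in 0.5.1 the third lane of x[1] was loaded into ax1 while the following FMA consumed ax3, so sum_12 accumulated whatever ax3 still held from the preceding x[0] load. 0.5.3 loads into ax3, matching the pattern of the other lanes.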
@@ -654,11 +655,11 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float
     }
     // leftovers
     // maximum number of leftover elements will be less that wsp_ggml_f32_epr. Apply predicated svmad on available elements only
-    if (np < n) {
-        svbool_t pg = svwhilelt_b32(np, n);
-        ay1 = svld1_f32(pg, y + np);
+    for (int i = np; i < n; i += wsp_ggml_f32_epr) {
+        svbool_t pg = svwhilelt_b32(i, n);
+        ay1 = svld1_f32(pg, y + i);
         ay1 = svmul_f32_m(pg, ay1, vx);
-        svst1_f32(pg, y + np, ay1);
+        svst1_f32(pg, y + i, ay1);
     }
 #elif defined(__riscv_v_intrinsic)
     for (int i = 0, avl; i < n; i += avl) {
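This is also a behavior fix. The main SVE loop advances by an unrolled step that is a multiple of wsp_ggml_f32_epr, so more than one vector's worth of elements can remain past np; the old single predicated block only scaled the first wsp_ggml_f32_epr of them. As a hypothetical illustration, with epr = 8 and a 4-way unrolled step of 32, n = 60 gives np = 32 and 28 leftovers, which need four predicated iterations rather than one.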
@@ -819,7 +820,8 @@ inline static void wsp_ggml_vec_tanh_f16 (const int n, wsp_ggml_fp16_t * y, cons
 inline static void wsp_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void wsp_ggml_vec_elu_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = WSP_GGML_CPU_FP32_TO_FP16(expm1f(WSP_GGML_CPU_FP16_TO_FP32(x[i])));
+        const float v = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = WSP_GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
     }
 }
 inline static void wsp_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
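The f16 variant previously applied expm1f to every input, whereas ELU is the identity for positive inputs (compare the f32 one-liner just above). For example, ELU(1.0) should be 1.0, but the 0.5.1 code produced expm1f(1.0) ≈ 1.718; the 0.5.3 version mirrors the f32 branch.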
@@ -1414,6 +1416,16 @@ inline static void wsp_ggml_vec_sum_f32(const int n, float * s, const float * x)
 #endif
 }
 
+inline static void wsp_ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        if (i == 0) {
+            y[i] = x[i];
+        } else {
+            y[i] = y[i - 1] + x[i];
+        }
+    }
+}
+
 inline static void wsp_ggml_vec_sum_f32_ggf(const int n, wsp_ggml_float * s, const float * x) {
     wsp_ggml_float sum = 0.0;
     for (int i = 0; i < n; ++i) {
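A quick illustration of the new cumulative-sum helper, with made-up values:

```cpp
// Usage sketch for wsp_ggml_vec_cumsum_f32 (illustrative values only).
float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
float y[4];
wsp_ggml_vec_cumsum_f32(4, y, x); // y = {1, 3, 6, 10}: y[i] is the sum of x[0..i]
```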
package/cpp/ggml-impl.h CHANGED
@@ -102,6 +102,9 @@ static bool wsp_ggml_op_is_empty(enum wsp_ggml_op op) {
     }
 }
 
+static inline float wsp_ggml_compute_softplus_f32(float input) {
+    return (input > 20.0f) ? input : logf(1 + expf(input));
+}
 //
 // logging
 //
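The 20.0f cutoff is a numerical guard rather than an approximation knob: for x > 20, e^-x is below 3e-9, so log(1 + e^x) = x + log(1 + e^-x) is indistinguishable from x at float precision, while evaluating expf(x) directly overflows to infinity once x exceeds roughly 88. A small demonstration of both regimes (illustrative only):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // Moderate inputs: the direct formula is accurate.
    std::printf("%g\n", std::log(1.0f + std::exp(10.0f))); // ~10.00005
    // Large inputs: expf overflows, which is why softplus falls back to x.
    std::printf("%g\n", std::exp(90.0f));                  // inf
    return 0;
}
```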
@@ -562,14 +565,23 @@ static inline wsp_ggml_bf16_t wsp_ggml_compute_fp32_to_bf16(float s) {
 #define WSP_GGML_FP32_TO_BF16(x) wsp_ggml_compute_fp32_to_bf16(x)
 #define WSP_GGML_BF16_TO_FP32(x) wsp_ggml_compute_bf16_to_fp32(x)
 
+static inline int32_t wsp_ggml_node_get_use_count(const struct wsp_ggml_cgraph * cgraph, int node_idx) {
+    const struct wsp_ggml_tensor * node = cgraph->nodes[node_idx];
+
+    size_t hash_pos = wsp_ggml_hash_find(&cgraph->visited_hash_set, node);
+    if (!wsp_ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos)) {
+        return 0;
+    }
+    return cgraph->use_counts[hash_pos];
+}
+
 // return true if the node's results are only used by N other nodes
 // and can be fused into their calculations.
 static inline bool wsp_ggml_node_has_n_uses(const struct wsp_ggml_cgraph * cgraph, int node_idx, int32_t n_uses) {
     const struct wsp_ggml_tensor * node = cgraph->nodes[node_idx];
 
     // check the use count against how many we're replacing
-    size_t hash_pos = wsp_ggml_hash_find(&cgraph->visited_hash_set, node);
-    if (!wsp_ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos) || cgraph->use_counts[hash_pos] != n_uses) {
+    if (wsp_ggml_node_get_use_count(cgraph, node_idx) != n_uses) {
         return false;
     }
 
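The lookup into the visited hash set is factored out of wsp_ggml_node_has_n_uses into the reusable wsp_ggml_node_get_use_count, with nodes that never entered the hash set reporting zero uses; the subgraph-fusion helpers added below build on the same query.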
@@ -635,11 +647,42 @@ static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int
     return wsp_ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
 }
 
+WSP_GGML_API bool wsp_ggml_can_fuse_subgraph_ext(const struct wsp_ggml_cgraph * cgraph,
+                                                 const int * node_idxs,
+                                                 int count,
+                                                 const enum wsp_ggml_op * ops,
+                                                 const int * outputs,
+                                                 int num_outputs);
+
+// Returns true if the subgraph formed by {node_idxs} can be fused
+// checks whethers all nodes which are not part of outputs can be elided
+// by checking if their num_uses are confined to the subgraph
+static inline bool wsp_ggml_can_fuse_subgraph(const struct wsp_ggml_cgraph * cgraph,
+                                              int node_idx,
+                                              int count,
+                                              const enum wsp_ggml_op * ops,
+                                              const int * outputs,
+                                              int num_outputs) {
+    WSP_GGML_ASSERT(count < 32);
+    if (node_idx + count > cgraph->n_nodes) {
+        return false;
+    }
+
+    int idxs[32];
+
+    for (int i = 0; i < count; ++i) {
+        idxs[i] = node_idx + i;
+    }
+
+    return wsp_ggml_can_fuse_subgraph_ext(cgraph, idxs, count, ops, outputs, num_outputs);
+}
+
 #ifdef __cplusplus
 }
 #endif
 
 #ifdef __cplusplus
+#include <array>
 #include <initializer_list>
 #include <vector>
 
@@ -648,6 +691,28 @@ inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int node_id
     return wsp_ggml_can_fuse(cgraph, node_idx, ops.begin(), (int)ops.size());
 }
 
+inline bool wsp_ggml_can_fuse_subgraph(const struct wsp_ggml_cgraph * cgraph,
+                                       int start_idx,
+                                       std::initializer_list<enum wsp_ggml_op> ops,
+                                       std::initializer_list<int> outputs = {}) {
+    return wsp_ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
+}
+
+// Return true if the edges in the graph match expectations.
+inline bool wsp_ggml_check_edges(const struct wsp_ggml_cgraph * cgraph,
+                                 int start_idx,
+                                 std::initializer_list<std::array<int, 3>> edges) {
+    for (const auto & edge : edges) {
+        int dst_node = edge[0];
+        int src_idx = edge[1];
+        int src_node = edge[2];
+        if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) {
+            return false;
+        }
+    }
+    return true;
+}
+
 // expose GGUF internals for test code
 WSP_GGML_API size_t wsp_gguf_type_size(enum wsp_gguf_type type);
 WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_file_impl(FILE * file, struct wsp_gguf_init_params params);
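A sketch of how the new helpers combine at a call site. The function name, the op chain, and the indices below are hypothetical (the WSP_GGML_OP_* values mirror upstream ggml's GGML_OP_* enum), not an API of this package:

```cpp
// Can nodes [i, i+3) be fused as MUL -> ADD -> UNARY, with only the last
// node (index 2, relative to i) visible outside the subgraph?
static bool try_fuse_mul_add_unary(const struct wsp_ggml_cgraph * cgraph, int i) {
    if (!wsp_ggml_can_fuse_subgraph(cgraph, i,
            { WSP_GGML_OP_MUL, WSP_GGML_OP_ADD, WSP_GGML_OP_UNARY }, { 2 })) {
        return false;
    }
    // Each edge is {dst_node, src_idx, src_node}, all relative to i:
    // node 1 must read node 0 as src[0]; node 2 must read node 1 as src[0].
    return wsp_ggml_check_edges(cgraph, i, {{1, 0, 0}, {2, 0, 1}});
}
```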
package/cpp/ggml-metal/ggml-metal-common.cpp CHANGED
@@ -112,7 +112,7 @@ static bool wsp_ggml_mem_ranges_add_dst(wsp_ggml_mem_ranges_t mrs, const wsp_ggm
 }
 
 bool wsp_ggml_mem_ranges_add(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
-    for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
+    for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
         if (tensor->src[i]) {
             wsp_ggml_mem_ranges_add_src(mrs, tensor->src[i]);
         }
@@ -173,7 +173,7 @@ static bool wsp_ggml_mem_ranges_check_dst(wsp_ggml_mem_ranges_t mrs, const wsp_g
 }
 
 bool wsp_ggml_mem_ranges_check(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
-    for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
+    for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
         if (tensor->src[i]) {
             if (!wsp_ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
                 return false;
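Both loops walk the tensor->src[] array, which has WSP_GGML_MAX_SRC slots; WSP_GGML_MAX_DIMS (4 in ggml) is smaller, so the 0.5.1 bound silently ignored the later source operands when recording and checking memory ranges for concurrent execution.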
package/cpp/ggml-metal/ggml-metal-context.m CHANGED
@@ -35,7 +35,6 @@ struct wsp_ggml_metal {
     // additional, inference-time compiled pipelines
     wsp_ggml_metal_pipelines_t pipelines_ext;
 
-    bool use_bfloat;
     bool use_fusion;
     bool use_concurrency;
     bool use_graph_optimize;
@@ -121,11 +120,10 @@ wsp_ggml_metal_t wsp_ggml_metal_init(wsp_ggml_metal_device_t dev) {
         }
     }
 
-    const struct wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(dev);
+    //const struct wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(dev);
 
     res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
-    res->use_bfloat = props_dev->has_bfloat;
    res->use_fusion = getenv("WSP_GGML_METAL_FUSION_DISABLE") == nil;
    res->use_concurrency = getenv("WSP_GGML_METAL_CONCURRENCY_DISABLE") == nil;
 
@@ -147,7 +145,6 @@ wsp_ggml_metal_t wsp_ggml_metal_init(wsp_ggml_metal_device_t dev) {
 
     memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));
 
-    WSP_GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, res->use_bfloat ? "true" : "false");
     WSP_GGML_LOG_INFO("%s: use fusion = %s\n", __func__, res->use_fusion ? "true" : "false");
     WSP_GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
     WSP_GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
@@ -292,7 +289,7 @@ void wsp_ggml_metal_set_tensor_async(wsp_ggml_metal_t ctx, struct wsp_ggml_tenso
 
     // queue the copy operation into the queue of the Metal context
     // this will be queued at the end, after any currently ongoing GPU operations
-    id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+    id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
     id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
     [encoder copyFromBuffer:buf_src
@@ -303,6 +300,7 @@ void wsp_ggml_metal_set_tensor_async(wsp_ggml_metal_t ctx, struct wsp_ggml_tenso
 
     [encoder endEncoding];
     [cmd_buf commit];
+    [buf_src release];
 
     // do not wait here for completion
     //[cmd_buf waitUntilCompleted];
@@ -333,7 +331,7 @@ void wsp_ggml_metal_get_tensor_async(wsp_ggml_metal_t ctx, const struct wsp_ggml
 
     // queue the copy operation into the queue of the Metal context
     // this will be queued at the end, after any currently ongoing GPU operations
-    id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+    id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
     id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
     [encoder copyFromBuffer:bid_src.metal
@@ -344,6 +342,7 @@ void wsp_ggml_metal_get_tensor_async(wsp_ggml_metal_t ctx, const struct wsp_ggml
 
     [encoder endEncoding];
     [cmd_buf commit];
+    [buf_dst release];
 
     // do not wait here for completion
     //[cmd_buf waitUntilCompleted];
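The two async copy paths get the same pair of fixes. commandBufferWithUnretainedReferences does not keep the resources it encodes alive, so the temporary wrapper buffer (buf_src in the set path, buf_dst in the get path) could be deallocated before the asynchronous blit ran; a plain commandBuffer retains what it references until completion. With the command buffer holding its own reference, the explicit release after commit drops the wrapper's ownership without waiting for the GPU.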