whisper.rn 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/cpp/ggml-alloc.c +11 -4
  2. package/cpp/ggml-backend-reg.cpp +8 -0
  3. package/cpp/ggml-backend.cpp +0 -2
  4. package/cpp/ggml-cpu/arch/arm/quants.c +428 -26
  5. package/cpp/ggml-cpu/ggml-cpu-impl.h +3 -1
  6. package/cpp/ggml-cpu/ggml-cpu.c +50 -21
  7. package/cpp/ggml-cpu/ops.cpp +458 -349
  8. package/cpp/ggml-cpu/ops.h +4 -4
  9. package/cpp/ggml-cpu/repack.cpp +143 -29
  10. package/cpp/ggml-cpu/simd-mappings.h +25 -25
  11. package/cpp/ggml-cpu/unary-ops.cpp +16 -0
  12. package/cpp/ggml-cpu/unary-ops.h +2 -0
  13. package/cpp/ggml-cpu/vec.cpp +17 -0
  14. package/cpp/ggml-cpu/vec.h +10 -0
  15. package/cpp/ggml-impl.h +17 -1
  16. package/cpp/ggml-metal/ggml-metal-context.m +5 -6
  17. package/cpp/ggml-metal/ggml-metal-device.cpp +101 -4
  18. package/cpp/ggml-metal/ggml-metal-device.h +8 -1
  19. package/cpp/ggml-metal/ggml-metal-device.m +216 -14
  20. package/cpp/ggml-metal/ggml-metal-impl.h +90 -2
  21. package/cpp/ggml-metal/ggml-metal-ops.cpp +346 -85
  22. package/cpp/ggml-metal/ggml-metal-ops.h +2 -0
  23. package/cpp/ggml-metal/ggml-metal.cpp +5 -0
  24. package/cpp/ggml-metal/ggml-metal.metal +12436 -0
  25. package/cpp/ggml.c +154 -5
  26. package/cpp/ggml.h +73 -0
  27. package/cpp/whisper.cpp +5 -1
  28. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +17 -1
  29. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +73 -0
  30. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  31. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-metal.metal +12436 -0
  32. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  33. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +17 -1
  34. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +73 -0
  35. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  36. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  37. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +12436 -0
  38. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  39. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +17 -1
  40. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +73 -0
  41. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  42. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-metal.metal +12436 -0
  43. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  44. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +17 -1
  45. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +73 -0
  46. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  47. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  48. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +12436 -0
  49. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  50. package/package.json +1 -1
  51. package/whisper-rn.podspec +1 -1
  52. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  53. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  56. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  57. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
package/cpp/ggml-cpu/ops.h CHANGED
@@ -34,6 +34,7 @@ void wsp_ggml_compute_forward_add1(const struct wsp_ggml_compute_params * params
  void wsp_ggml_compute_forward_acc(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_sum(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_sum_rows(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_cumsum(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_mean(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_argmax(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_count_equal(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
@@ -51,10 +52,6 @@ void wsp_ggml_compute_forward_scale(const struct wsp_ggml_compute_params * param
  void wsp_ggml_compute_forward_set(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_cpy(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_cont(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
- void wsp_ggml_compute_forward_reshape(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
- void wsp_ggml_compute_forward_view(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
- void wsp_ggml_compute_forward_permute(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
- void wsp_ggml_compute_forward_transpose(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_get_rows(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_get_rows_back(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_set_rows(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
@@ -85,6 +82,8 @@ void wsp_ggml_compute_forward_arange(const struct wsp_ggml_compute_params * para
  void wsp_ggml_compute_forward_timestep_embedding(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_argsort(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_leaky_relu(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_tri(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_fill(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_flash_attn_ext(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_flash_attn_back(
  const struct wsp_ggml_compute_params * params,
@@ -100,6 +99,7 @@ void wsp_ggml_compute_forward_get_rel_pos(const struct wsp_ggml_compute_params *
  void wsp_ggml_compute_forward_add_rel_pos(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_rwkv_wkv6(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_rwkv_wkv7(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_solve_tri(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_gla(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_map_custom1(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_map_custom2(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
package/cpp/ggml-cpu/repack.cpp CHANGED
@@ -1600,6 +1600,55 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, wsp_ggml_type
  return false;
  }

+ void forward_mul_mat_one_chunk(wsp_ggml_compute_params * params,
+ wsp_ggml_tensor * op,
+ int64_t src0_start,
+ int64_t src0_end,
+ int64_t src1_start,
+ int64_t src1_end) {
+ const wsp_ggml_tensor * src0 = op->src[0];
+ const wsp_ggml_tensor * src1 = op->src[1];
+ wsp_ggml_tensor * dst = op;
+
+ WSP_GGML_TENSOR_BINARY_OP_LOCALS
+
+ const size_t src1_col_stride = wsp_ggml_row_size(PARAM_TYPE, ne10);
+
+ WSP_GGML_ASSERT(ne03 == 1 && ne13 == 1);
+ WSP_GGML_ASSERT(ne12 % ne02 == 0);
+ const int64_t r2 = ne12 / ne02;
+
+ const int64_t i12 = src1_start / ne1;
+ const int64_t i11 = src1_start - i12 * ne1;
+
+ // Determine batch index
+ const int64_t i02 = i12 / r2;
+
+ const int64_t i1 = i11;
+ const int64_t i2 = i12;
+
+ const char * src0_ptr = (const char *) src0->data + i02 * nb02;
+ const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride;
+ char * dst_ptr = ((char *) dst->data + (i1 * nb1 + i2 * nb2));
+
+ const int64_t nrows = src1_end - src1_start;
+ const int64_t ncols = src0_end - src0_start;
+
+ WSP_GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize);
+
+ // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+ if (nrows > 3) {
+ gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
+ src0_ptr + src0_start * nb01, src1_ptr,
+ nrows - (nrows % 4), ncols);
+ }
+ for (int iter = nrows - (nrows % 4); iter < nrows; iter++) {
+ gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start,
+ ne01, src0_ptr + src0_start * nb01,
+ src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols);
+ }
+ }
+
  void forward_mul_mat(wsp_ggml_compute_params * params, wsp_ggml_tensor * op) {
  const wsp_ggml_tensor * src0 = op->src[0];
  const wsp_ggml_tensor * src1 = op->src[1];
@@ -1621,6 +1670,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, wsp_ggml_type
  WSP_GGML_ASSERT(nb1 <= nb2);
  WSP_GGML_ASSERT(nb2 <= nb3);

+ // TODO: General batched mul mat for 4D tensors
+ // Currently only supports 3D tensors
+ WSP_GGML_ASSERT(ne03 == 1);
+ WSP_GGML_ASSERT(ne13 == 1);
+ WSP_GGML_ASSERT(ne3 == 1);
+
  WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);

  WSP_GGML_ASSERT(wsp_ggml_n_dims(op->src[0]) == 2);
@@ -1628,46 +1683,101 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, wsp_ggml_type

  char * wdata = static_cast<char *>(params->wdata);
  const size_t nbw1 = wsp_ggml_row_size(PARAM_TYPE, ne10);
+ const size_t nbw2 = nbw1 * ne11;

- assert(params->wsize >= nbw1 * ne11);
+ assert(params->wsize >= nbw2 * ne12);

  const wsp_ggml_from_float_t from_float = wsp_ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

- int64_t i11_processed = 0;
- for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
- wsp_ggml_wsp_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
- }
+ // INFO: Quantization is done in planes to avoid extra complexity in chunking.
+ // Flattening dimensions not multiple of INTER_SIZE would require extra handling depending on how
+ // the planes are broadcast.
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ char * data_ptr = (char *) src1->data + i12 * nb12;
+ char * wdata_ptr = wdata + i12 * nbw2;
+
+ for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+ wsp_ggml_wsp_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11),
+ (void *) (wdata_ptr + i11 * nbw1), 4, ne10);
+ }

- i11_processed = ne11 - ne11 % 4;
- for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
- from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
+ const int64_t i11_processed = ne11 - ne11 % 4;
+ for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
+ from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10);
+ }
  }

- wsp_ggml_barrier(params->threadpool);
+ // disable for NUMA
+ const bool disable_chunking = wsp_ggml_is_numa();

- const void * src1_wdata = params->wdata;
- const size_t src1_col_stride = wsp_ggml_row_size(PARAM_TYPE, ne10);
- int64_t src0_start = (ith * ne01) / nth;
- int64_t src0_end = ((ith + 1) * ne01) / nth;
- src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
- src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
- if (src0_start >= src0_end) {
- return;
+ // 4x chunks per thread
+ const int64_t nr0 = wsp_ggml_nrows(op->src[0]);
+
+ int nth_scaled = nth * 4;
+ int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled;
+ int64_t nchunk0 = (nr0 + chunk_size0 - 1) / chunk_size0;
+
+ // src1 is chunked only by full planes.
+ // When we flatten we need to address dimensions not multiple of the q8 INTER_SIZE
+ // to route them thorugh GEMV.
+ // nchunk1 = ne12 also avoids messing the chunking for models with no 3d tensors
+ // to avoid affecting their performance
+ int64_t nchunk1 = ne12;
+
+ // Ensure minimum chunk size to avoid alignment issues with high thread counts
+ // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
+ const int64_t min_chunk_size = NB_COLS;
+ if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) {
+ nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
  }

- // If there are more than three rows in src1, use gemm; otherwise, use gemv.
- if (ne11 > 3) {
- gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
- (float *) ((char *) dst->data) + src0_start, ne01,
- (const char *) src0->data + src0_start * nb01,
- (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+ if (nth == 1 || nchunk0 < nth || disable_chunking) {
+ nchunk0 = nth;
  }
- for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
- gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
- (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
- (const char *) src0->data + src0_start * nb01,
- (const char *) src1_wdata + (src1_col_stride * iter), 1,
- src0_end - src0_start);
+
+ const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+
+ // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
+ // This prevents creating too many tiny chunks that could overlap after alignment
+ const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
+ nchunk0 = MIN(nchunk0, max_nchunk);
+
+ if (ith == 0) {
+ // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+ wsp_ggml_threadpool_chunk_set(params->threadpool, nth);
+ }
+
+ wsp_ggml_barrier(params->threadpool);
+
+ // The first chunk comes from our thread_id, the rest will get auto-assigned.
+ int current_chunk = ith;
+
+ while (current_chunk < nchunk0 * nchunk1) {
+ const int64_t ith0 = current_chunk % nchunk0;
+ const int64_t ith1 = current_chunk / nchunk0;
+
+ int64_t src0_start = dr0 * ith0;
+ int64_t src0_end = MIN(src0_start + dr0, nr0);
+
+ // full-plane range for src1
+ int64_t src1_start = ith1 * ne11;
+ int64_t src1_end = (ith1 + 1) * ne11;
+
+ // Align boundaries to NB_COLS - round up to ensure all data is included
+ // The chunk size limiting above ensures chunks are large enough to prevent overlaps
+ src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
+ src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
+ src0_end = MIN(src0_end, ne01);
+
+ // Make sure current plane is the last one before exiting
+ if (src0_start >= src0_end) {
+ current_chunk = wsp_ggml_threadpool_chunk_add(params->threadpool, 1);
+ continue;
+ }
+
+ forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end);
+
+ current_chunk = wsp_ggml_threadpool_chunk_add(params->threadpool, 1);
  }
  }

@@ -1772,8 +1882,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, wsp_ggml_type
  int64_t src0_cur_start = (ith * ne01) / nth;
  int64_t src0_cur_end = ((ith + 1) * ne01) / nth;

+ // Align boundaries to NB_COLS - round up to ensure all data is included
  src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
  src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
+ if (src0_cur_end > ne01) {
+ src0_cur_end = ne01;
+ }

  if (src0_cur_start >= src0_cur_end) {
  return;
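
Note on the new scheduling in repack.cpp: the old code split src0 rows statically per thread, while the new code carves the (src0 row chunks) x (src1 planes) space into nchunk0 * nchunk1 units and lets threads claim them through the shared threadpool counter (wsp_ggml_threadpool_chunk_set / wsp_ggml_threadpool_chunk_add). Below is a minimal standalone sketch of that claim-a-chunk pattern, using std::atomic in place of the threadpool helpers; run_chunks and process_chunk are hypothetical names, not code from the package.

#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

// Hypothetical stand-in for processing one (src0 rows) x (src1 plane) chunk.
static void process_chunk(int64_t /*chunk*/) { /* gemm/gemv over that range */ }

// Claim-a-chunk loop: thread ith starts on chunk ith, then pulls the next
// unclaimed chunk from a shared counter that starts at nth (mirroring
// wsp_ggml_threadpool_chunk_set(tp, nth) and wsp_ggml_threadpool_chunk_add(tp, 1)).
static void run_chunks(int nth, int64_t nchunk0, int64_t nchunk1) {
    std::atomic<int64_t> next_chunk{nth};
    auto worker = [&](int ith) {
        int64_t current = ith;
        while (current < nchunk0 * nchunk1) {
            process_chunk(current);
            current = next_chunk.fetch_add(1); // returns the previous value, i.e. the claimed chunk
        }
    };
    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) {
        threads.emplace_back(worker, ith);
    }
    for (auto & t : threads) {
        t.join();
    }
}

Each chunk is processed exactly once, and faster threads simply claim more chunks, which is what makes the 4x-chunks-per-thread split useful on unevenly loaded cores.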
package/cpp/ggml-cpu/simd-mappings.h CHANGED
@@ -956,7 +956,7 @@ do { \

  #define WSP_GGML_F32Cx8 __m256
  #define WSP_GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
- #define WSP_GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
+ #define WSP_GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))

  static inline __m256 __lasx_f32cx8_load(const wsp_ggml_fp16_t * x) {
  __m256i a;
@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {

  #define WSP_GGML_F32x4 __m128
  #define WSP_GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
- #define WSP_GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+ #define WSP_GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
  #define WSP_GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
  #define WSP_GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
  #define WSP_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
  #define WSP_GGML_F32x4_ADD __lsx_vfadd_s
  #define WSP_GGML_F32x4_MUL __lsx_vfmul_s
- #define WSP_GGML_F32x4_REDUCE(res, x) \
- { \
- int offset = WSP_GGML_F32_ARR >> 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
- } \
- __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
- tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
- const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
- tmp = __lsx_vsrli_d((__m128i) t0, 32); \
- tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
- res = (wsp_ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+
+ #define WSP_GGML_F32x4_REDUCE(res, x) \
+ { \
+ int offset = WSP_GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+ } \
+ __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
+ __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
+ __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
+ __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
+ __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
+ __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
+ res = (wsp_ggml_float) ((v4f32)t5)[0]; \
  }

  #define WSP_GGML_F32_VEC WSP_GGML_F32x4
@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {

  #define WSP_GGML_F32Cx4 __m128
  #define WSP_GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
- #define WSP_GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+ #define WSP_GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
  #define WSP_GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
  #define WSP_GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
  #define WSP_GGML_F32Cx4_FMA WSP_GGML_F32x4_FMA
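
The reworked WSP_GGML_F32x4_REDUCE still folds the WSP_GGML_F32_ARR accumulators pairwise and then horizontally adds the four lanes of x[0]; only the lane sum changed, now using __lsx_vpickev_w/__lsx_vpickod_w shuffles plus __lsx_vfadd_s instead of the old shift/shuffle chain. A scalar model of the same reduction, assuming a power-of-two accumulator count (reduce_f32x4 is illustrative only, not code from the package):

#include <cstddef>

// Scalar model of the reduction: pairwise-fold nacc four-lane accumulators
// into x[0], then sum the four lanes of x[0].
static float reduce_f32x4(float x[][4], std::size_t nacc) {
    for (std::size_t offset = nacc >> 1; offset > 0; offset >>= 1) {
        for (std::size_t i = 0; i < offset; ++i) {
            for (int lane = 0; lane < 4; ++lane) {
                x[i][lane] += x[i + offset][lane];
            }
        }
    }
    return x[0][0] + x[0][1] + x[0][2] + x[0][3];
}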
package/cpp/ggml-cpu/unary-ops.cpp CHANGED
@@ -73,6 +73,14 @@ static inline float op_log(float x) {
  return logf(x);
  }

+ static inline float op_expm1(float x) {
+ return expf(x) - 1.0f;
+ }
+
+ static inline float op_softplus(float x) {
+ return (x > 20.0f) ? x : logf(1.0f + expf(x));
+ }
+
  static inline float op_floor(float x) {
  return floorf(x);
  }
@@ -290,6 +298,14 @@ void wsp_ggml_compute_forward_log(const wsp_ggml_compute_params * params, wsp_gg
  unary_op<op_log>(params, dst);
  }

+ void wsp_ggml_compute_forward_expm1(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
+ unary_op<op_expm1>(params, dst);
+ }
+
+ void wsp_ggml_compute_forward_softplus(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
+ unary_op<op_softplus>(params, dst);
+ }
+
  void wsp_ggml_compute_forward_floor(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
  unary_op<op_floor>(params, dst);
  }
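
The 20.0f cutoff in op_softplus avoids overflowing expf for large inputs, where log(1 + exp(x)) is numerically equal to x in single precision anyway. A small standalone check of that behavior (softplus_ref mirrors the op but uses std::log1p for a slightly more accurate small-x path; it is not code from the package):

#include <cmath>
#include <cstdio>
#include <initializer_list>

// Past the cutoff the identity log(1 + exp(x)) ~= x is exact in float,
// and evaluating exp(x) directly would overflow.
static float softplus_ref(float x) {
    return (x > 20.0f) ? x : std::log1p(std::exp(x));
}

int main() {
    for (float x : {-10.0f, 0.0f, 10.0f, 30.0f}) {
        std::printf("softplus(%.1f) = %g\n", x, softplus_ref(x));
    }
    return 0;
}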
package/cpp/ggml-cpu/unary-ops.h CHANGED
@@ -22,6 +22,8 @@ void wsp_ggml_compute_forward_sqrt(const struct wsp_ggml_compute_params * params
  void wsp_ggml_compute_forward_sin(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_cos(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_log(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_expm1(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_softplus(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_floor(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_ceil(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_round(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
package/cpp/ggml-cpu/vec.cpp CHANGED
@@ -360,6 +360,13 @@ void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x) {
  for (; i + 3 < n; i += 4) {
  vst1q_f32(y + i, wsp_ggml_v_silu(vld1q_f32(x + i)));
  }
+ #elif defined(__riscv_v_intrinsic)
+ for (int vl; i < n; i += vl) {
+ vl = __riscv_vsetvl_e32m2(n - i);
+ vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+ vfloat32m2_t vy = wsp_ggml_v_silu_m2(vx, vl);
+ __riscv_vse32_v_f32m2(&y[i], vy, vl);
+ }
  #endif
  for (; i < n; ++i) {
  y[i] = wsp_ggml_silu_f32(x[i]);
@@ -460,6 +467,16 @@ wsp_ggml_float wsp_ggml_vec_cvar_f32(const int n, float * y, const float * x, co
  val = vec_mul(val, val);
  sum += (wsp_ggml_float)vec_hsum_f32x4(val);
  }
+ #elif defined(__riscv_v_intrinsic)
+ vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
+ for (int vl; i < n; i += vl) {
+ vl = __riscv_vsetvl_e32m2(n - i);
+ vfloat32m2_t val = __riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], vl), mean, vl);
+ __riscv_vse32_v_f32m2(&y[i], val, vl);
+ val = __riscv_vfmul_vv_f32m2(val, val, vl);
+ vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, vl);
+ }
+ sum = (wsp_ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
  #endif
  for (; i < n; ++i) {
  float val = x[i] - mean;
package/cpp/ggml-cpu/vec.h CHANGED
@@ -1416,6 +1416,16 @@ inline static void wsp_ggml_vec_sum_f32(const int n, float * s, const float * x)
  #endif
  }

+ inline static void wsp_ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
+ for (int i = 0; i < n; ++i) {
+ if (i == 0) {
+ y[i] = x[i];
+ } else {
+ y[i] = y[i - 1] + x[i];
+ }
+ }
+ }
+
  inline static void wsp_ggml_vec_sum_f32_ggf(const int n, wsp_ggml_float * s, const float * x) {
  wsp_ggml_float sum = 0.0;
  for (int i = 0; i < n; ++i) {
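
wsp_ggml_vec_cumsum_f32 is an inclusive prefix sum: y[0] = x[0], then y[i] = y[i-1] + x[i]. An equivalent accumulator form, shown only for illustration (cumsum_f32 is not part of the package):

#include <cstdio>

// Same recurrence as wsp_ggml_vec_cumsum_f32, written with an explicit accumulator.
static void cumsum_f32(int n, float * y, const float * x) {
    float acc = 0.0f;
    for (int i = 0; i < n; ++i) {
        acc += x[i];
        y[i] = acc;
    }
}

int main() {
    const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4];
    cumsum_f32(4, y, x);
    std::printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); // prints: 1 3 6 10
    return 0;
}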
package/cpp/ggml-impl.h CHANGED
@@ -102,7 +102,7 @@ static bool wsp_ggml_op_is_empty(enum wsp_ggml_op op) {
  }
  }

- static inline float wsp_ggml_softplus(float input) {
+ static inline float wsp_ggml_compute_softplus_f32(float input) {
  return (input > 20.0f) ? input : logf(1 + expf(input));
  }
  //
@@ -682,6 +682,7 @@ static inline bool wsp_ggml_can_fuse_subgraph(const struct wsp_ggml_cgraph * cgr
  #endif

  #ifdef __cplusplus
+ #include <array>
  #include <initializer_list>
  #include <vector>

@@ -697,6 +698,21 @@ inline bool wsp_ggml_can_fuse_subgraph(const struct wsp_ggml_cgraph * c
  return wsp_ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
  }

+ // Return true if the edges in the graph match expectations.
+ inline bool wsp_ggml_check_edges(const struct wsp_ggml_cgraph * cgraph,
+ int start_idx,
+ std::initializer_list<std::array<int, 3>> edges) {
+ for (const auto & edge : edges) {
+ int dst_node = edge[0];
+ int src_idx = edge[1];
+ int src_node = edge[2];
+ if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
  // expose GGUF internals for test code
  WSP_GGML_API size_t wsp_gguf_type_size(enum wsp_gguf_type type);
  WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_file_impl(FILE * file, struct wsp_gguf_init_params params);
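
wsp_ggml_check_edges verifies that, inside a candidate fusion window starting at start_idx, node dst_node takes node src_node as its src[src_idx]; each expected edge is a {dst_node, src_idx, src_node} triple given relative to start_idx. A self-contained model of that check using a mock node type (Node, check_edges, and the example graph below are hypothetical, used only to illustrate the triple convention):

#include <array>
#include <cstdio>
#include <initializer_list>

// Mock node: only the src[] edges matter for the check.
struct Node {
    const Node * src[4] = {nullptr, nullptr, nullptr, nullptr};
};

// Same logic as wsp_ggml_check_edges, over the mock type: every expected edge
// {dst_node, src_idx, src_node} must hold relative to start_idx.
static bool check_edges(const Node * const * nodes, int start_idx,
                        std::initializer_list<std::array<int, 3>> edges) {
    for (const auto & e : edges) {
        if (nodes[start_idx + e[0]]->src[e[1]] != nodes[start_idx + e[2]]) {
            return false;
        }
    }
    return true;
}

int main() {
    Node a;
    Node b;
    b.src[0] = &a; // node 1 consumes node 0 as its src[0]
    const Node * nodes[2] = {&a, &b};
    std::printf("edges ok: %d\n", check_edges(nodes, 0, {{1, 0, 0}}));
    return 0;
}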
package/cpp/ggml-metal/ggml-metal-context.m CHANGED
@@ -35,7 +35,6 @@ struct wsp_ggml_metal {
  // additional, inference-time compiled pipelines
  wsp_ggml_metal_pipelines_t pipelines_ext;

- bool use_bfloat;
  bool use_fusion;
  bool use_concurrency;
  bool use_graph_optimize;
@@ -121,11 +120,10 @@ wsp_ggml_metal_t wsp_ggml_metal_init(wsp_ggml_metal_device_t dev) {
  }
  }

- const struct wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(dev);
+ //const struct wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(dev);

  res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

- res->use_bfloat = props_dev->has_bfloat;
  res->use_fusion = getenv("WSP_GGML_METAL_FUSION_DISABLE") == nil;
  res->use_concurrency = getenv("WSP_GGML_METAL_CONCURRENCY_DISABLE") == nil;

@@ -147,7 +145,6 @@ wsp_ggml_metal_t wsp_ggml_metal_init(wsp_ggml_metal_device_t dev) {

  memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));

- WSP_GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, res->use_bfloat ? "true" : "false");
  WSP_GGML_LOG_INFO("%s: use fusion = %s\n", __func__, res->use_fusion ? "true" : "false");
  WSP_GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
  WSP_GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
@@ -292,7 +289,7 @@ void wsp_ggml_metal_set_tensor_async(wsp_ggml_metal_t ctx, struct wsp_ggml_tenso

  // queue the copy operation into the queue of the Metal context
  // this will be queued at the end, after any currently ongoing GPU operations
- id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+ id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

  [encoder copyFromBuffer:buf_src
@@ -303,6 +300,7 @@ void wsp_ggml_metal_set_tensor_async(wsp_ggml_metal_t ctx, struct wsp_ggml_tenso

  [encoder endEncoding];
  [cmd_buf commit];
+ [buf_src release];

  // do not wait here for completion
  //[cmd_buf waitUntilCompleted];
@@ -333,7 +331,7 @@ void wsp_ggml_metal_get_tensor_async(wsp_ggml_metal_t ctx, const struct wsp_ggml

  // queue the copy operation into the queue of the Metal context
  // this will be queued at the end, after any currently ongoing GPU operations
- id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+ id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

  [encoder copyFromBuffer:bid_src.metal
@@ -344,6 +342,7 @@ void wsp_ggml_metal_get_tensor_async(wsp_ggml_metal_t ctx, const struct wsp_ggml

  [encoder endEncoding];
  [cmd_buf commit];
+ [buf_dst release];

  // do not wait here for completion
  //[cmd_buf waitUntilCompleted];