whisper.rn 0.5.1 → 0.5.2

Files changed (66)
  1. package/android/src/main/jni.cpp +12 -3
  2. package/cpp/ggml-alloc.c +38 -14
  3. package/cpp/ggml-backend-impl.h +0 -3
  4. package/cpp/ggml-backend.h +2 -0
  5. package/cpp/ggml-cpu/amx/amx.cpp +1 -0
  6. package/cpp/ggml-cpu/ggml-cpu-impl.h +1 -1
  7. package/cpp/ggml-cpu/ggml-cpu.c +17 -3
  8. package/cpp/ggml-cpu/ops.cpp +33 -17
  9. package/cpp/ggml-cpu/unary-ops.cpp +135 -0
  10. package/cpp/ggml-cpu/unary-ops.h +5 -0
  11. package/cpp/ggml-cpu/vec.cpp +66 -0
  12. package/cpp/ggml-cpu/vec.h +10 -8
  13. package/cpp/ggml-impl.h +51 -2
  14. package/cpp/ggml-metal/ggml-metal-common.cpp +2 -2
  15. package/cpp/ggml-metal/ggml-metal-device.cpp +199 -10
  16. package/cpp/ggml-metal/ggml-metal-device.h +18 -0
  17. package/cpp/ggml-metal/ggml-metal-device.m +27 -14
  18. package/cpp/ggml-metal/ggml-metal-impl.h +87 -7
  19. package/cpp/ggml-metal/ggml-metal-ops.cpp +513 -88
  20. package/cpp/ggml-metal/ggml-metal-ops.h +6 -0
  21. package/cpp/ggml-metal/ggml-metal.cpp +3 -3
  22. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  23. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  24. package/cpp/ggml.c +166 -2
  25. package/cpp/ggml.h +66 -0
  26. package/cpp/jsi/RNWhisperJSI.cpp +7 -2
  27. package/cpp/rn-whisper.h +1 -0
  28. package/cpp/whisper.cpp +4 -2
  29. package/ios/RNWhisperContext.mm +3 -1
  30. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  31. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  32. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +51 -2
  33. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +66 -0
  34. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  35. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  36. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  37. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  38. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  39. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +51 -2
  40. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +66 -0
  41. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  42. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  43. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  44. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  45. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  46. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +51 -2
  47. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +66 -0
  48. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  49. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  50. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  51. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
  52. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -0
  53. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +51 -2
  54. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +66 -0
  55. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  56. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  57. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  58. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  59. package/lib/commonjs/version.json +1 -1
  60. package/lib/module/NativeRNWhisper.js.map +1 -1
  61. package/lib/module/version.json +1 -1
  62. package/lib/typescript/NativeRNWhisper.d.ts +2 -0
  63. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  64. package/package.json +1 -1
  65. package/src/NativeRNWhisper.ts +2 -0
  66. package/src/version.json +1 -1
package/android/src/main/jni.cpp CHANGED
@@ -276,6 +276,7 @@ JNIEXPORT jlong JNICALL
  Java_com_rnwhisper_WhisperContext_initContextWithAsset(
  JNIEnv *env,
  jobject thiz,
+ jint context_id,
  jobject asset_manager,
  jstring model_path_str
  ) {
@@ -290,6 +291,7 @@ Java_com_rnwhisper_WhisperContext_initContextWithAsset(
  const char *model_path_chars = env->GetStringUTFChars(model_path_str, nullptr);
  context = whisper_init_from_asset(env, asset_manager, model_path_chars, cparams);
  env->ReleaseStringUTFChars(model_path_str, model_path_chars);
+ rnwhisper_jsi::addContext(context_id, reinterpret_cast<jlong>(context));
  return reinterpret_cast<jlong>(context);
  }

@@ -297,6 +299,7 @@ JNIEXPORT jlong JNICALL
  Java_com_rnwhisper_WhisperContext_initContextWithInputStream(
  JNIEnv *env,
  jobject thiz,
+ jint context_id,
  jobject input_stream
  ) {
  UNUSED(thiz);
@@ -308,6 +311,7 @@ Java_com_rnwhisper_WhisperContext_initContextWithInputStream(

  struct whisper_context *context = nullptr;
  context = whisper_init_from_input_stream(env, input_stream, cparams);
+ rnwhisper_jsi::addContext(context_id, reinterpret_cast<jlong>(context));
  return reinterpret_cast<jlong>(context);
  }

@@ -421,8 +425,9 @@ Java_com_rnwhisper_WhisperContext_fullWithNewJob(
  LOGI("About to reset timings");
  whisper_reset_timings(context);

- LOGI("About to run whisper_full");
- int code = whisper_full(context, params, audio_data_arr, audio_data_len);
+ int n_processors = readablemap::getInt(env, options, "nProcessors", 1);
+ LOGI("About to run whisper_full_parallel with n_processors=%d", n_processors);
+ int code = whisper_full_parallel(context, params, audio_data_arr, audio_data_len, n_processors);
  if (code == 0) {
  // whisper_print_timings(context);
  }
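whisper_full_parallel splits the input audio into n_processors roughly equal chunks, transcribes each on its own whisper state, and merges the results; accuracy can degrade near chunk boundaries, and n_processors == 1 is equivalent to the old whisper_full call. For reference, its declaration in whisper.h (unchanged by this diff):

    // Declared in whisper.h; shown here for context only.
    WHISPER_API int whisper_full_parallel(
                   struct whisper_context * ctx,
                   struct whisper_full_params params,
                   const float * samples,
                   int n_samples,
                   int n_processors);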
@@ -441,8 +446,11 @@ Java_com_rnwhisper_WhisperContext_createRealtimeTranscribeJob(
  jlong context_ptr,
  jobject options
  ) {
+ UNUSED(thiz);
+ UNUSED(context_ptr);
  whisper_full_params params = createFullParams(env, options);
  rnwhisper::job* job = rnwhisper::job_new(job_id, params);
+ job->n_processors = readablemap::getInt(env, options, "nProcessors", 1);
  rnwhisper::vad_params vad;
  vad.use_vad = readablemap::getBool(env, options, "useVad", false);
  vad.vad_ms = readablemap::getInt(env, options, "vadMs", 2000);
@@ -534,11 +542,12 @@ Java_com_rnwhisper_WhisperContext_fullWithJob(
  jint n_samples
  ) {
  UNUSED(thiz);
+ UNUSED(env);
  struct whisper_context *context = reinterpret_cast<struct whisper_context *>(context_ptr);

  rnwhisper::job* job = rnwhisper::job_get(job_id);
  float* pcmf32 = job->pcm_slice_to_f32(slice_index, n_samples);
- int code = whisper_full(context, job->params, pcmf32, n_samples);
+ int code = whisper_full_parallel(context, job->params, pcmf32, n_samples, job->n_processors);
  free(pcmf32);
  if (code == 0) {
  // whisper_print_timings(context);
package/cpp/ggml-alloc.c CHANGED
@@ -392,12 +392,8 @@ static void wsp_ggml_dyn_tallocr_free(struct wsp_ggml_dyn_tallocr * alloc) {
  free(alloc);
  }

- static size_t wsp_ggml_dyn_tallocr_max_size(struct wsp_ggml_dyn_tallocr * alloc) {
- size_t max_size = 0;
- for (int i = 0; i < alloc->n_chunks; i++) {
- max_size += alloc->chunks[i]->max_size;
- }
- return max_size;
+ static size_t wsp_ggml_dyn_tallocr_max_size(struct wsp_ggml_dyn_tallocr * alloc, int chunk) {
+ return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
  }

@@ -417,10 +413,8 @@ static void wsp_ggml_vbuffer_free(struct vbuffer * buf) {
  free(buf);
  }

- static int wsp_ggml_vbuffer_n_chunks(struct vbuffer * buf) {
- int n = 0;
- while (n < WSP_GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
- return n;
+ static size_t wsp_ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
+ return buf->chunks[chunk] ? wsp_ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
  }

  static size_t wsp_ggml_vbuffer_size(struct vbuffer * buf) {
@@ -604,6 +598,26 @@ static bool wsp_ggml_gallocr_is_allocated(wsp_ggml_gallocr_t galloc, struct wsp_
  return t->data != NULL || wsp_ggml_gallocr_hash_get(galloc, t)->allocated;
  }

+ // free the extra space at the end if the new tensor is smaller
+ static void wsp_ggml_gallocr_free_extra_space(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * node, struct wsp_ggml_tensor * parent) {
+ struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
+ struct hash_node * p_hn = wsp_ggml_gallocr_hash_get(galloc, parent);
+
+ size_t parent_size = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
+ size_t node_size = wsp_ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+
+ WSP_GGML_ASSERT(parent_size >= node_size);
+
+ if (parent_size > node_size) {
+ struct wsp_ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+ struct buffer_address p_addr = p_hn->addr;
+ p_addr.offset += node_size;
+ size_t extra_size = parent_size - node_size;
+ AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
+ wsp_ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+ }
+ }
+
  static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * node, int buffer_id) {
  WSP_GGML_ASSERT(buffer_id >= 0);
  struct hash_node * hn = wsp_ggml_gallocr_hash_get(galloc, node);
@@ -649,6 +663,7 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
  hn->addr = p_hn->addr;
  p_hn->allocated = false; // avoid freeing the parent
  view_src_hn->allocated = false;
+ wsp_ggml_gallocr_free_extra_space(galloc, node, view_src);
  return;
  }
  } else {
@@ -656,6 +671,7 @@ static void wsp_ggml_gallocr_allocate_node(wsp_ggml_gallocr_t galloc, struct wsp
  hn->buffer_id = p_hn->buffer_id;
  hn->addr = p_hn->addr;
  p_hn->allocated = false; // avoid freeing the parent
+ wsp_ggml_gallocr_free_extra_space(galloc, node, parent);
  return;
  }
  }
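wsp_ggml_gallocr_free_extra_space returns the unused tail of an in-place reuse to the dynamic allocator, so a smaller node no longer pins its parent's full allocation. A minimal sketch of the arithmetic, with invented sizes:

    #include <assert.h>
    #include <stddef.h>

    // Illustration only (sizes are made up): a 1024-byte parent reused in
    // place by a 256-byte node leaves a 768-byte tail right after the node,
    // which is handed back to the allocator.
    static void tail_example(void) {
        size_t parent_size = 1024, node_size = 256, parent_offset = 4096;
        assert(parent_size >= node_size);
        size_t tail_offset = parent_offset + node_size; // 4352
        size_t tail_size   = parent_size - node_size;   // 768 bytes freed
        (void)tail_offset; (void)tail_size;
    }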
@@ -885,12 +901,20 @@ bool wsp_ggml_gallocr_reserve_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgrap
  }
  }

- size_t cur_size = galloc->buffers[i] ? wsp_ggml_vbuffer_size(galloc->buffers[i]) : 0;
- size_t new_size = wsp_ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
-
  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
- if (new_size > cur_size || galloc->buffers[i] == NULL) {
+ bool realloc = galloc->buffers[i] == NULL;
+ size_t new_size = 0;
+ for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+ size_t cur_chunk_size = galloc->buffers[i] ? wsp_ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
+ size_t new_chunk_size = wsp_ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
+ new_size += new_chunk_size;
+ if (new_chunk_size > cur_chunk_size) {
+ realloc = true;
+ }
+ }
+ if (realloc) {
  #ifndef NDEBUG
+ size_t cur_size = galloc->buffers[i] ? wsp_ggml_vbuffer_size(galloc->buffers[i]) : 0;
  WSP_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, wsp_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif
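The reallocation test is now per chunk rather than on the summed size: a buffer whose total requirement stays flat can still need reallocating if any single chunk grows. A minimal sketch of the check, with invented sizes:

    #include <stdbool.h>
    #include <stddef.h>

    // Sketch of the per-chunk growth test above; the sizes are illustrative.
    static bool needs_realloc(const size_t *cur, const size_t *req, int n_chunks) {
        for (int c = 0; c < n_chunks; c++) {
            if (req[c] > cur[c]) return true; // any growing chunk forces a reallocation
        }
        return false;
    }

    // needs_realloc((size_t[]){60, 40}, (size_t[]){50, 50}, 2) returns true:
    // the totals match (100), but the second chunk grew from 40 to 50.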
package/cpp/ggml-backend-impl.h CHANGED
@@ -209,9 +209,6 @@ extern "C" {
  void * context;
  };

- // Internal backend registry API
- WSP_GGML_API void wsp_ggml_backend_register(wsp_ggml_backend_reg_t reg);
-
  // Add backend dynamic loading support to the backend

  // Initialize the backend

package/cpp/ggml-backend.h CHANGED
@@ -215,6 +215,8 @@ extern "C" {
  // Backend registry
  //

+ WSP_GGML_API void wsp_ggml_backend_register(wsp_ggml_backend_reg_t reg);
+
  WSP_GGML_API void wsp_ggml_backend_device_register(wsp_ggml_backend_dev_t device);

  // Backend (reg) enumeration

package/cpp/ggml-cpu/amx/amx.cpp CHANGED
@@ -149,6 +149,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
  if (op->op == WSP_GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
  is_contiguous_2d(op->src[1]) && // src1 must be contiguous
  op->src[0]->buffer && op->src[0]->buffer->buft == wsp_ggml_backend_amx_buffer_type() &&
+ op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
  op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
  (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == WSP_GGML_TYPE_F16))) {
  // src1 must be host buffer

package/cpp/ggml-cpu/ggml-cpu-impl.h CHANGED
@@ -68,7 +68,7 @@ struct wsp_ggml_compute_params {
  #endif // __VXE2__
  #endif // __s390x__ && __VEC__

- #if defined(__ARM_FEATURE_SVE)
+ #if defined(__ARM_FEATURE_SVE) && defined(__linux__)
  #include <sys/prctl.h>
  #endif

package/cpp/ggml-cpu/ggml-cpu.c CHANGED
@@ -689,8 +689,13 @@ bool wsp_ggml_is_numa(void) {
  #endif

  static void wsp_ggml_init_arm_arch_features(void) {
- #if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+ #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+ #if defined(__linux__)
  wsp_ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+ #else
+ // TODO: add support of SVE for non-linux systems
+ #error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
+ #endif
  #endif
  }

@@ -2179,6 +2184,10 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
  case WSP_GGML_UNARY_OP_HARDSWISH:
  case WSP_GGML_UNARY_OP_HARDSIGMOID:
  case WSP_GGML_UNARY_OP_EXP:
+ case WSP_GGML_UNARY_OP_FLOOR:
+ case WSP_GGML_UNARY_OP_CEIL:
+ case WSP_GGML_UNARY_OP_ROUND:
+ case WSP_GGML_UNARY_OP_TRUNC:
  {
  n_tasks = 1;
  } break;
@@ -2187,6 +2196,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
  case WSP_GGML_UNARY_OP_GELU_ERF:
  case WSP_GGML_UNARY_OP_GELU_QUICK:
  case WSP_GGML_UNARY_OP_SILU:
+ case WSP_GGML_UNARY_OP_XIELU:
  {
  n_tasks = n_threads;
  } break;
@@ -3557,13 +3567,17 @@ void wsp_ggml_cpu_init(void) {
  #ifdef WSP_GGML_USE_OPENMP
  //if (!getenv("OMP_WAIT_POLICY")) {
  // // set the wait policy to active, so that OpenMP threads don't sleep
- // putenv("OMP_WAIT_POLICY=active");
+ // setenv("OMP_WAIT_POLICY", "active", 0)
  //}

  if (!getenv("KMP_BLOCKTIME")) {
  // set the time to wait before sleeping a thread
  // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
- putenv("KMP_BLOCKTIME=200"); // 200ms
+ #ifdef _WIN32
+ _putenv_s("KMP_BLOCKTIME", "200"); // 200ms
+ #else
+ setenv("KMP_BLOCKTIME", "200", 0); // 200ms
+ #endif
  }
  #endif
  }
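Unlike putenv, setenv with overwrite = 0 leaves an existing value untouched, so a user-supplied KMP_BLOCKTIME now takes precedence over the library default (on Windows, _putenv_s overwrites unconditionally). A minimal POSIX illustration:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        setenv("KMP_BLOCKTIME", "100", 1);       // simulate a value set by the user
        setenv("KMP_BLOCKTIME", "200", 0);       // overwrite = 0: existing value kept
        printf("%s\n", getenv("KMP_BLOCKTIME")); // prints 100
        return 0;
    }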

package/cpp/ggml-cpu/ops.cpp CHANGED
@@ -3467,31 +3467,27 @@ static void wsp_ggml_compute_forward_norm_f32(

  WSP_GGML_ASSERT(eps >= 0.0f);

- // TODO: optimize
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
  for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
  const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);

- wsp_ggml_float sum = 0.0;
- for (int64_t i00 = 0; i00 < ne00; i00++) {
- sum += (wsp_ggml_float)x[i00];
- }
-
+ float sum = 0.0;
+ wsp_ggml_vec_sum_f32(ne00, &sum, x);
  float mean = sum/ne00;

  float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+ float variance = 0;

- wsp_ggml_float sum2 = 0.0;
- for (int64_t i00 = 0; i00 < ne00; i00++) {
- float v = x[i00] - mean;
- y[i00] = v;
- sum2 += (wsp_ggml_float)(v*v);
- }
+ #ifdef WSP_GGML_USE_ACCELERATE
+ mean = -mean;
+ vDSP_vsadd(x, 1, &mean, y, 1, ne00);
+ vDSP_measqv(y, 1, &variance, ne00);
+ #else
+ variance = wsp_ggml_vec_cvar_f32(ne00, y, x, mean);
+ #endif //WSP_GGML_USE_ACCELERATE

- float variance = sum2/ne00;
  const float scale = 1.0f/sqrtf(variance + eps);
-
  wsp_ggml_vec_scale_f32(ne00, y, scale);
  }
  }
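Both branches compute the same standard normalization; on the Accelerate path, vDSP_vsadd adds the negated mean to every element and vDSP_measqv returns the mean of the squared, now-centered values, which is exactly the variance:

\[
\mu = \frac{1}{n}\sum_i x_i, \qquad
\sigma^2 = \frac{1}{n}\sum_i (x_i - \mu)^2, \qquad
y_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}.
\]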
@@ -8135,7 +8131,7 @@ static void wsp_ggml_compute_forward_flash_attn_ext_f16(
  }

  // V /= S
- const float S_inv = 1.0f/S;
+ const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
  wsp_ggml_vec_scale_f32(DV, VKQ32, S_inv);

  // dst indices
@@ -8637,7 +8633,7 @@ static void wsp_ggml_compute_forward_ssm_scan_f32(
  // n_head
  for (int h = ih0; h < ih1; ++h) {
  // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
- const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+ const float dt_soft_plus = wsp_ggml_softplus(dt[h]);
  const float dA = expf(dt_soft_plus * A[h]);
  const int g = h / (nh / ng); // repeat_interleave

@@ -8734,7 +8730,7 @@ static void wsp_ggml_compute_forward_ssm_scan_f32(
  // n_head
  for (int h = ih0; h < ih1; ++h) {
  // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
- const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+ const float dt_soft_plus = wsp_ggml_softplus(dt[h]);
  const int g = h / (nh / ng); // repeat_interleave

  // dim
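wsp_ggml_softplus itself lands in ggml-impl.h (the +51 −2 change listed above, not shown in this excerpt); judging from the inline expression it replaces, it presumably keeps the same numerically stable form:

    #include <math.h>

    // Hedged sketch, inferred from the removed inline code; the actual helper
    // is defined in the ggml-impl.h changes not shown here.
    static inline float wsp_ggml_softplus(float x) {
        // log1pf(expf(x)) would overflow for large x, where softplus(x) ~= x anyway
        return x <= 20.0f ? log1pf(expf(x)) : x;
    }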
@@ -8997,6 +8993,26 @@ void wsp_ggml_compute_forward_unary(
  {
  wsp_ggml_compute_forward_exp(params, dst);
  } break;
+ case WSP_GGML_UNARY_OP_FLOOR:
+ {
+ wsp_ggml_compute_forward_floor(params, dst);
+ } break;
+ case WSP_GGML_UNARY_OP_CEIL:
+ {
+ wsp_ggml_compute_forward_ceil(params, dst);
+ } break;
+ case WSP_GGML_UNARY_OP_ROUND:
+ {
+ wsp_ggml_compute_forward_round(params, dst);
+ } break;
+ case WSP_GGML_UNARY_OP_TRUNC:
+ {
+ wsp_ggml_compute_forward_trunc(params, dst);
+ } break;
+ case WSP_GGML_UNARY_OP_XIELU:
+ {
+ wsp_ggml_compute_forward_xielu(params, dst);
+ } break;
  default:
  {
  WSP_GGML_ABORT("fatal error");

package/cpp/ggml-cpu/unary-ops.cpp CHANGED
@@ -52,6 +52,15 @@ static inline float op_sqrt(float x) {
  return sqrtf(x);
  }

+ static inline float op_xielu(float x, float alpha_n, float alpha_p, float beta, float eps) {
+ if (x > 0.0f) {
+ return alpha_p * x * x + beta * x;
+ } else {
+ const float min_x_eps = fminf(x, eps);
+ return (expm1f(min_x_eps) - x) * alpha_n + beta * x;
+ }
+ }
+
  static inline float op_sin(float x) {
  return sinf(x);
  }
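In math form, the piecewise activation above is

\[
\mathrm{xielu}(x) =
\begin{cases}
\alpha_p\,x^2 + \beta\,x, & x > 0,\\
\alpha_n\bigl(e^{\min(x,\,\varepsilon)} - 1 - x\bigr) + \beta\,x, & x \le 0,
\end{cases}
\]

with min(x, ε) clamping the argument of expm1f exactly as in the code.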
@@ -64,6 +73,22 @@ static inline float op_log(float x) {
  return logf(x);
  }

+ static inline float op_floor(float x) {
+ return floorf(x);
+ }
+
+ static inline float op_ceil(float x) {
+ return ceilf(x);
+ }
+
+ static inline float op_round(float x) {
+ return roundf(x);
+ }
+
+ static inline float op_trunc(float x) {
+ return truncf(x);
+ }
+
  template <float (*op)(float), typename src0_t, typename dst_t>
  static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
  constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
@@ -121,6 +146,86 @@ static void unary_op(const wsp_ggml_compute_params * params, wsp_ggml_tensor * d
  }
  }

+ template <float (*op)(float, wsp_ggml_tensor *)>
+ static void unary_op_params(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
+ const wsp_ggml_tensor * src0 = dst->src[0];
+
+ /* */ if (src0->type == WSP_GGML_TYPE_F32 && dst->type == WSP_GGML_TYPE_F32) { // all f32
+ apply_unary_op<op, float, float>(params, dst);
+ } else if (src0->type == WSP_GGML_TYPE_F16 && dst->type == WSP_GGML_TYPE_F16) { // all f16
+ apply_unary_op<op, wsp_ggml_fp16_t, wsp_ggml_fp16_t>(params, dst);
+ } else if (src0->type == WSP_GGML_TYPE_BF16 && dst->type == WSP_GGML_TYPE_BF16) { // all bf16
+ apply_unary_op<op, wsp_ggml_bf16_t, wsp_ggml_bf16_t>(params, dst);
+ } else if (src0->type == WSP_GGML_TYPE_BF16 && dst->type == WSP_GGML_TYPE_F32) {
+ apply_unary_op<op, wsp_ggml_bf16_t, float>(params, dst);
+ } else if (src0->type == WSP_GGML_TYPE_F16 && dst->type == WSP_GGML_TYPE_F32) {
+ apply_unary_op<op, wsp_ggml_fp16_t, float>(params, dst);
+ } else {
+ fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+ wsp_ggml_type_name(dst->type), wsp_ggml_type_name(src0->type));
+ WSP_GGML_ABORT("fatal error");
+ }
+ }
+
+ // Extend vec_unary_op to support functors
+ template <typename Op, typename src0_t, typename dst_t>
+ static inline void vec_unary_op_functor(int64_t n, dst_t * y, const src0_t * x, Op op) {
+ constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+ constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
+
+ for (int i = 0; i < n; i++) {
+ y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+ }
+ }
+
+ // Extend apply_unary_op to support functors
+ template <typename Op, typename src0_t, typename dst_t>
+ static void apply_unary_op_functor(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst, Op op) {
+ const wsp_ggml_tensor * src0 = dst->src[0];
+
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous_1(src0) && wsp_ggml_is_contiguous_1(dst) && wsp_ggml_are_same_shape(src0, dst));
+
+ WSP_GGML_TENSOR_UNARY_OP_LOCALS
+
+ WSP_GGML_ASSERT( nb0 == sizeof(dst_t));
+ WSP_GGML_ASSERT(nb00 == sizeof(src0_t));
+
+ const auto [ir0, ir1] = get_thread_range(params, src0);
+
+ for (int64_t ir = ir0; ir < ir1; ++ir) {
+ const int64_t i03 = ir/(ne02*ne01);
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+ dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+ const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+ vec_unary_op_functor(ne0, dst_ptr, src0_ptr, op);
+ }
+ }
+
+ // Generic dispatcher for functors
+ template <typename Op>
+ static void unary_op_functor(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst, Op op) {
+ const wsp_ggml_tensor * src0 = dst->src[0];
+
+ /* */ if (src0->type == WSP_GGML_TYPE_F32 && dst->type == WSP_GGML_TYPE_F32) { // all f32
+ apply_unary_op_functor<Op, float, float>(params, dst, op);
+ } else if (src0->type == WSP_GGML_TYPE_F16 && dst->type == WSP_GGML_TYPE_F16) { // all f16
+ apply_unary_op_functor<Op, wsp_ggml_fp16_t, wsp_ggml_fp16_t>(params, dst, op);
+ } else if (src0->type == WSP_GGML_TYPE_BF16 && dst->type == WSP_GGML_TYPE_BF16) { // all bf16
+ apply_unary_op_functor<Op, wsp_ggml_bf16_t, wsp_ggml_bf16_t>(params, dst, op);
+ } else if (src0->type == WSP_GGML_TYPE_BF16 && dst->type == WSP_GGML_TYPE_F32) {
+ apply_unary_op_functor<Op, wsp_ggml_bf16_t, float>(params, dst, op);
+ } else if (src0->type == WSP_GGML_TYPE_F16 && dst->type == WSP_GGML_TYPE_F32) {
+ apply_unary_op_functor<Op, wsp_ggml_fp16_t, float>(params, dst, op);
+ } else {
+ fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+ wsp_ggml_type_name(dst->type), wsp_ggml_type_name(src0->type));
+ WSP_GGML_ABORT("fatal error");
+ }
+ }
+
  void wsp_ggml_compute_forward_abs(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
  unary_op<op_abs>(params, dst);
  }
@@ -184,3 +289,33 @@ void wsp_ggml_compute_forward_cos(const wsp_ggml_compute_params * params, wsp_gg
  void wsp_ggml_compute_forward_log(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
  unary_op<op_log>(params, dst);
  }
+
+ void wsp_ggml_compute_forward_floor(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
+ unary_op<op_floor>(params, dst);
+ }
+
+ void wsp_ggml_compute_forward_ceil(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
+ unary_op<op_ceil>(params, dst);
+ }
+
+ void wsp_ggml_compute_forward_round(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
+ unary_op<op_round>(params, dst);
+ }
+
+ void wsp_ggml_compute_forward_trunc(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
+ unary_op<op_trunc>(params, dst);
+ }
+
+ void wsp_ggml_compute_forward_xielu(const wsp_ggml_compute_params * params, wsp_ggml_tensor * dst) {
+ const float alpha_n = wsp_ggml_get_op_params_f32(dst, 1);
+ const float alpha_p = wsp_ggml_get_op_params_f32(dst, 2);
+ const float beta = wsp_ggml_get_op_params_f32(dst, 3);
+ const float eps = wsp_ggml_get_op_params_f32(dst, 4);
+
+ const auto xielu_op_params = [alpha_n, alpha_p, beta, eps](float f) {
+ return op_xielu(f, alpha_n, alpha_p, beta, eps);
+ };
+
+ unary_op_functor(params, dst, xielu_op_params);
+ }
+
package/cpp/ggml-cpu/unary-ops.h CHANGED
@@ -22,6 +22,11 @@ void wsp_ggml_compute_forward_sqrt(const struct wsp_ggml_compute_params * params
  void wsp_ggml_compute_forward_sin(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_cos(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_log(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_floor(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_ceil(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_round(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_trunc(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_xielu(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);

  #ifdef __cplusplus
  }

package/cpp/ggml-cpu/vec.cpp CHANGED
@@ -404,6 +404,72 @@ void wsp_ggml_vec_swiglu_f32(const int n, float * y, const float * x, const floa
  }
  }

+ wsp_ggml_float wsp_ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) {
+ int i = 0;
+ wsp_ggml_float sum = 0;
+ // TODO: optimize to process the remaining elements in groups using the smaller vector sizes from AVX2 and SSE
+ // ref: https://github.com/ggml-org/llama.cpp/pull/15953#pullrequestreview-3310928344
+ #if defined(__AVX512F__) && defined(__AVX512DQ__)
+ for (; i + 15 < n; i += 16) {
+ __m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i),
+ _mm512_set1_ps(mean));
+ _mm512_storeu_ps(y + i, val);
+ sum += (wsp_ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val));
+ }
+ #elif defined(__AVX2__) && defined(__FMA__)
+ for (; i + 7 < n; i += 8) {
+ __m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
+ _mm256_set1_ps(mean));
+ _mm256_storeu_ps(y + i, val);
+ val = _mm256_mul_ps(val,val);
+ __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+ _mm256_castps256_ps128(val));
+ val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+ val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+ sum += (wsp_ggml_float)_mm_cvtss_f32(val2);
+ }
+ #elif defined(__SSE2__)
+ for (; i + 3 < n; i += 4) {
+ __m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
+ _mm_set1_ps(mean));
+ _mm_storeu_ps(y + i, val);
+ val = _mm_mul_ps(val, val);
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+ val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+ val = _mm_add_ss(val, _mm_movehdup_ps(val));
+ #else
+ __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+ val = _mm_add_ps(val, tmp);
+ tmp = _mm_movehl_ps(tmp, val);
+ val = _mm_add_ss(val, tmp);
+ #endif // __AVX__ || __AVX2__ || __AVX512F__
+ sum += (wsp_ggml_float)_mm_cvtss_f32(val);
+ }
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
+ for (; i + 3 < n; i += 4) {
+ float32x4_t val = vsubq_f32(vld1q_f32(x + i),
+ vdupq_n_f32(mean));
+ vst1q_f32(y + i, val);
+ val = vmulq_f32(val, val);
+ sum += (wsp_ggml_float)vaddvq_f32(val);
+ }
+ #elif defined(__VXE__) || defined(__VXE2__)
+ for (; i + 3 < n; i += 4) {
+ float32x4_t val = vec_sub(vec_xl(0, x + i), vec_splats(mean));
+ vec_xst(val, 0, y + i);
+ val = vec_mul(val, val);
+ sum += (wsp_ggml_float)vec_hsum_f32x4(val);
+ }
+ #endif
+ for (; i < n; ++i) {
+ float val = x[i] - mean;
+ y[i] = val;
+ val *= val;
+ sum += (wsp_ggml_float)val;
+ }
+ return sum/n;
+ }
+
  wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
  int i = 0;
  wsp_ggml_float sum = 0;

package/cpp/ggml-cpu/vec.h CHANGED
@@ -44,6 +44,7 @@ void wsp_ggml_vec_dot_bf16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_gg
  void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggml_fp16_t * WSP_GGML_RESTRICT x, size_t bx, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y, size_t by, int nrc);

  void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x);
+ wsp_ggml_float wsp_ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
  wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
  wsp_ggml_float wsp_ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);

@@ -143,14 +144,14 @@ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float
  for (int i = 0; i < np; i += wsp_ggml_f16_step) {
  ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0); // 8 elements

- ax1 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 0*wsp_ggml_f16_epr, 0); // 8 elemnst
+ ax1 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
  sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
  ax1 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
  sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, ax1, ay1);

  ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1); // next 8 elements

- ax2 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 1*wsp_ggml_f16_epr, 1); // next 8 ekements
+ ax2 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 1*wsp_ggml_f16_epr, 1); // next 8 elements
  sum_01 = WSP_GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
  ax2 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 1*wsp_ggml_f16_epr, 1);
  sum_11 = WSP_GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -159,7 +160,7 @@ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float

  ax3 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 2*wsp_ggml_f16_epr, 2);
  sum_02 = WSP_GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
- ax1 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 2*wsp_ggml_f16_epr, 2);
+ ax3 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 2*wsp_ggml_f16_epr, 2);
  sum_12 = WSP_GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

  ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
@@ -654,11 +655,11 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float
  }
  // leftovers
  // maximum number of leftover elements will be less that wsp_ggml_f32_epr. Apply predicated svmad on available elements only
- if (np < n) {
- svbool_t pg = svwhilelt_b32(np, n);
- ay1 = svld1_f32(pg, y + np);
+ for (int i = np; i < n; i += wsp_ggml_f32_epr) {
+ svbool_t pg = svwhilelt_b32(i, n);
+ ay1 = svld1_f32(pg, y + i);
  ay1 = svmul_f32_m(pg, ay1, vx);
- svst1_f32(pg, y + np, ay1);
+ svst1_f32(pg, y + i, ay1);
  }
  #elif defined(__riscv_v_intrinsic)
  for (int i = 0, avl; i < n; i += avl) {
@@ -819,7 +820,8 @@ inline static void wsp_ggml_vec_tanh_f16 (const int n, wsp_ggml_fp16_t * y, cons
  inline static void wsp_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
  inline static void wsp_ggml_vec_elu_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
  for (int i = 0; i < n; ++i) {
- y[i] = WSP_GGML_CPU_FP32_TO_FP16(expm1f(WSP_GGML_CPU_FP16_TO_FP32(x[i])));
+ const float v = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
  }
  }
  inline static void wsp_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
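For reference, the function both ELU variants now compute is

\[
\mathrm{elu}(x) =
\begin{cases}
x, & x > 0,\\
e^{x} - 1, & x \le 0;
\end{cases}
\]

the f16 path previously applied expm1f to positive inputs as well.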